Date: (Sun) Jun 21, 2015
Data: Source: Training: https://courses.edx.org/asset-v1:MITx+15.071x_2a+2T2015+type@asset+block/climate_change.csv
New:
Time period:
Based on analysis utilizing <> techniques,
Regression results: First run:
Classification results: First run:
Use plot.ly for interactive plots ?
varImp for randomForest crashes in caret version:6.0.41 -> submit bug report
extensions toward multiclass classification are scheduled for the next release
glm_dmy_mdl should use the same method as glm_sel_mdl until a custom dummy classifier is implemented
# Session setup: clear the workspace, fix the RNG seed for reproducibility,
# and source the author's personal utility scripts (mydsutils.R pulls in
# caret, which in turn loads lattice/ggplot2 -- see the captured output below).
# NOTE(review): rm(list=ls()) and "~"-relative source() paths make this script
# machine-specific; acceptable for a personal analysis template only.
rm(list=ls())
set.seed(12345)
options(stringsAsFactors=FALSE)
source("~/Dropbox/datascience/R/myscript.R")
source("~/Dropbox/datascience/R/mydsutils.R")
## Loading required package: caret
## Loading required package: lattice
## Loading required package: ggplot2
source("~/Dropbox/datascience/R/myplot.R")
source("~/Dropbox/datascience/R/mypetrinet.R")
source("~/Dropbox/datascience/R/myplclust.R")
# Gather all package requirements here
# Register a 4-worker parallel backend for caret's foreach-based resampling.
suppressPackageStartupMessages(require(doMC))
registerDoMC(4) # max(length(glb_txt_vars), glb_n_cv_folds) + 1
#packageVersion("snow")
#require(sos); findFn("cosine", maxPages=2, sortby="MaxScore")
# Analysis control global variables
# Data sources: training comes from the edX-hosted climate_change.csv; there
# is no separate new-observations file for this exercise (placeholder URL).
glb_trnng_url <- "https://courses.edx.org/asset-v1:MITx+15.071x_2a+2T2015+type@asset+block/climate_change.csv"
glb_newdt_url <- "<newdt_url>"
glb_out_pfx <- "template2_"
glb_save_envir <- FALSE # or TRUE
# Split control: no separate new dataset; instead carve a "new" (test)
# partition out of the training data using the condition Year > 2006.
glb_is_separate_newobs_dataset <- FALSE # or TRUE
glb_split_entity_newobs_datasets <- TRUE # or FALSE
glb_split_newdata_method <- "condition" # "condition" or "sample" or "copy"
glb_split_newdata_condition <- "Year > 2006"
glb_split_newdata_size_ratio <- 0.3 # > 0 & < 1
glb_split_sample.seed <- 123 # or any integer
glb_max_fitobs <- NULL # or any integer
# Problem type: regression on the raw Temp column (no factor mapping needed).
glb_is_regression <- TRUE; glb_is_classification <- !glb_is_regression;
glb_is_binomial <- NULL # or TRUE or FALSE
glb_rsp_var_raw <- "Temp"
# for classification, the response variable has to be a factor
glb_rsp_var <- glb_rsp_var_raw # or "Temp.fctr"
# if the response factor is based on numbers/logicals e.g (0/1 OR TRUE/FALSE vs. "A"/"B"),
# or contains spaces (e.g. "Not in Labor Force")
# caret predict(..., type="prob") crashes
glb_map_rsp_raw_to_var <- NULL # or function(raw) {
# ret_vals <- rep_len(NA, length(raw)); ret_vals[!is.na(raw)] <- ifelse(raw[!is.na(raw)] == 1, "Y", "N"); return(relevel(as.factor(ret_vals), ref="N"))
# #as.factor(paste0("B", raw))
# #as.factor(gsub(" ", "\\.", raw))
# }
# glb_map_rsp_raw_to_var(c(1, 1, 0, 0, 0))
glb_map_rsp_var_to_raw <- NULL # or function(var) {
# as.numeric(var) - 1
# #as.numeric(var)
# #gsub("\\.", " ", levels(var)[as.numeric(var)])
# c("<=50K", " >50K")[as.numeric(var)]
# #c(FALSE, TRUE)[as.numeric(var)]
# }
# glb_map_rsp_var_to_raw(glb_map_rsp_raw_to_var(c(1, 1, 0, 0, 0)))
# Guard: a renamed response requires an explicit raw->var mapping function.
if ((glb_rsp_var != glb_rsp_var_raw) & is.null(glb_map_rsp_raw_to_var))
stop("glb_map_rsp_raw_to_var function expected")
glb_rsp_var_out <- paste0(glb_rsp_var, ".predict.") # model_id is appended later
# List info gathered for various columns
# <col_name>: <description>; <notes>
# Year: the observation year.
# Month: the observation month.
# Temp: the difference in degrees Celsius between the average global temperature in that period and a reference value. This data comes from the Climatic Research Unit at the University of East Anglia.
# CO2, N2O, CH4, CFC.11, CFC.12: atmospheric concentrations of carbon dioxide (CO2), nitrous oxide (N2O), methane (CH4), trichlorofluoromethane (CCl3F; commonly referred to as CFC-11) and dichlorodifluoromethane (CCl2F2; commonly referred to as CFC-12), respectively. This data comes from the ESRL/NOAA Global Monitoring Division.
# CO2, N2O and CH4 are expressed in ppmv (parts per million by volume -- i.e., 397 ppmv of CO2 means that CO2 constitutes 397 millionths of the total volume of the atmosphere)
# CFC.11 and CFC.12 are expressed in ppbv (parts per billion by volume).
# Aerosols: the mean stratospheric aerosol optical depth at 550 nm. This variable is linked to volcanoes, as volcanic eruptions result in new particles being added to the atmosphere, which affect how much of the sun's energy is reflected back into space. This data is from the Goddard Institute for Space Studies at NASA.
# TSI: the total solar irradiance (TSI) in W/m2 (the rate at which the sun's energy is deposited per unit area). Due to sunspots and other solar phenomena, the amount of energy that is given off by the sun varies substantially with time. This data is from the SOLARIS-HEPPA project website.
# MEI: multivariate El Nino Southern Oscillation index (MEI), a measure of the strength of the El Nino/La Nina-Southern Oscillation (a weather effect in the Pacific Ocean that affects global temperatures). This data comes from the ESRL/NOAA Physical Sciences Division.
# If multiple vars are parts of id, consider concatenating them to create one id var
# If glb_id_var == NULL, ".rownames <- row.names()" is the default
# Observation identifiers / feature-engineering controls. All are left at
# their template defaults for this exercise (no id column, no categorical
# grouping vars, no drops, no transforms, no date vars, no text vars).
glb_id_var <- NULL # or c("<var1>")
glb_category_vars <- NULL # or c("<var1>", "<var2>")
glb_drop_vars <- c(NULL) # or c("<col_name>")
glb_map_vars <- NULL # or c("<var1>", "<var2>")
glb_map_urls <- list();
# glb_map_urls[["<var1>"]] <- "<var1.url>"
glb_assign_pairs_lst <- NULL;
# glb_assign_pairs_lst[["<var1>"]] <- list(from=c(NA),
# to=c("NA.my"))
glb_assign_vars <- names(glb_assign_pairs_lst)
# Per-column transformation hooks; the commented examples show the expected
# list(mapfn=..., sfx=...) shape from other exercises.
glb_transform_lst <- NULL;
# glb_transform_lst[["<var>"]] <- list(
# mapfn=function(raw) { tfr_raw <- as.character(cut(raw, 5));
# tfr_raw[is.na(tfr_raw)] <- "NA.my";
# return(as.factor(tfr_raw)) }
# , sfx=".my.fctr")
# mapfn=function(raw) { mod_raw <- raw;
# mod_raw <- gsub("&#[[:digit:]]{3};", " ", mod_raw);
# # Modifications for this exercise only
# mod_raw <- gsub("\\bgoodIn ", "good In", mod_raw);
# return(mod_raw)
# }
# , sfx=".my")
# mapfn(glb_allobs_df$review[739])
# mapfn(glb_allobs_df$review[740])
# ret_lst <- regexec("&#[[:digit:]]{3};", raw, ignore.case=FALSE); ret_lst <- regmatches(raw, ret_lst); ret_vctr <- sapply(1:length(ret_lst), function(pos_ix) ifelse(length(ret_lst[[pos_ix]]) > 0, ret_lst[[pos_ix]], "")); print(ret_vctr <- ret_vctr[ret_vctr != ""])
# ret_lst <- gregexpr("&#[[:digit:]]{3};", raw, ignore.case=FALSE); print(ret_lst); print(length(ret_lst[[1]]))
# mapfn(glb_allobs_df$review)
# mapfn(glb_allobs_df$review[739:740])
# mapfn(glb_allobs_df$<var>)
# glb_transform_lst[["<var1>"]] <- glb_transform_lst[["<var2>"]]
# Add logs of numerics that are not distributed normally -> do automatically ???
glb_transform_vars <- names(glb_transform_lst)
glb_date_vars <- NULL # or c("<date_var>")
glb_date_fmts <- list(); #glb_date_fmts[["<date_var>"]] <- "%m/%e/%y"
glb_date_tzs <- list(); #glb_date_tzs[["<date_var>"]] <- "America/New_York"
#grep("America/New", OlsonNames(), value=TRUE)
glb_txt_vars <- NULL # or c("<txt_var1>", "<txt_var2>")
#Sys.setlocale("LC_ALL", "C") # For english
glb_append_stop_words <- list()
# Remember to use unstemmed words
#orderBy(~ -cor.y.abs, subset(glb_feats_df, grepl("[HSA]\\.T\\.", id) & !is.na(cor.high.X)))
#dsp_obs(Headline.contains="polit")
#subset(glb_allobs_df, H.T.compani > 0)[, c("UniqueID", "Headline", "H.T.compani")]
# glb_append_stop_words[["<txt_var1>"]] <- c(NULL
# # ,"<word1>" # <reason1>
# )
#subset(glb_allobs_df, S.T.newyorktim > 0)[, c("UniqueID", "Snippet", "S.T.newyorktim")]
#glb_txt_lst[["Snippet"]][which(glb_allobs_df$UniqueID %in% c(8394, 8317, 8339, 8350, 8307))]
glb_important_terms <- list()
# Remember to use stemmed terms
glb_sprs_thresholds <- NULL # or c(0.988, 0.970, 0.970) # Generates 29, 22, 22 terms
# Properties:
# numrows(glb_feats_df) << numrows(glb_fitobs_df)
# Select terms that appear in at least 0.2 * O(FP/FN(glb_OOBobs_df))
# numrows(glb_OOBobs_df) = 1.1 * numrows(glb_newobs_df)
# No-op here: both operands are NULL (no text vars / no sparsity thresholds).
names(glb_sprs_thresholds) <- glb_txt_vars
# Derived features (consolidate this with transform features ???)
# Hooks for computed columns; each entry is list(mapfn=..., args=...) as the
# commented examples from other exercises show. None are used here.
glb_derive_lst <- NULL;
# glb_derive_lst[["PTS.diff"]] <- list(
# mapfn=function(PTS, oppPTS) { return(PTS - oppPTS) }
# , args=c("PTS", "oppPTS"))
# glb_derive_lst[["<txt_var>.niso8859.log"]] <- list(
# mapfn=function(<txt_var>) { match_lst <- gregexpr("&#[[:digit:]]{3};", <txt_var>)
# match_num_vctr <- unlist(lapply(match_lst,
# function(elem) length(elem)))
# return(log(1 + match_num_vctr)) }
# , args=c("<txt_var>"))
# args_lst <- NULL; for (arg in glb_derive_lst[["PTS.diff"]]$args) args_lst[[arg]] <- glb_allobs_df[, arg]; do.call(mapfn, args_lst)
# glb_derive_lst[["<var1>"]] <- glb_derive_lst[["<var2>"]]
# # Create user-specified pattern vectors
# #sum(mycount_pattern_occ("Metropolitan Diary:", glb_allobs_df$Abstract) > 0)
# if (txt_var %in% c("Snippet", "Abstract")) {
# txt_X_df[, paste0(txt_var_pfx, ".P.metropolitan.diary.colon")] <-
# as.integer(0 + mycount_pattern_occ("Metropolitan Diary:",
# glb_allobs_df[, txt_var]))
#summary(glb_allobs_df[ ,grep("P.on.this.day", names(glb_allobs_df), value=TRUE)])
glb_derive_vars <- names(glb_derive_lst)
# User-specified exclusions
glb_exclude_vars_as_features <- NULL # or c("<var_name>")
# If the response was renamed (e.g. to a factor), exclude the raw column so
# models never train on the untransformed response. Not triggered here.
if (glb_rsp_var_raw != glb_rsp_var)
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features,
glb_rsp_var_raw)
# List feats that shd be excluded due to known causation by prediction variable
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features,
c(NULL)) # or c("<col_name>")
glb_impute_na_data <- FALSE # or TRUE
glb_mice_complete.seed <- 144 # or any integer
glb_cluster <- FALSE # or TRUE
glb_interaction_only_features <- NULL # or ???
# Model configuration: which caret methods to try, tuning grid, CV folds,
# and the criteria used to pick the "selected" model.
glb_models_lst <- list(); glb_models_df <- data.frame()
# Regression
# NOTE(review): if glb_is_regression were FALSE while glb_is_binomial stayed
# NULL, `if (glb_is_binomial)` would error ("argument is of length zero").
# Unreachable with the current settings (glb_is_regression is TRUE).
if (glb_is_regression)
glb_models_method_vctr <- c("lm", "glm", "bayesglm", "rpart", "rf") else
# Classification
if (glb_is_binomial)
glb_models_method_vctr <- c("glm", "bayesglm", "rpart", "rf") else
glb_models_method_vctr <- c("rpart", "rf")
# Baseline prediction model feature(s)
glb_Baseline_mdl_var <- NULL # or c("<col_name>")
# Optional cost matrix for a custom classification metric (unused here).
glb_model_metric_terms <- NULL # or matrix(c(
# 0,1,2,3,4,
# 2,0,1,2,3,
# 4,2,0,1,2,
# 6,4,2,0,1,
# 8,6,4,2,0
# ), byrow=TRUE, nrow=5)
glb_model_metric <- NULL # or "<metric_name>"
glb_model_metric_maximize <- NULL # or FALSE (TRUE is not the default for both classification & regression)
glb_model_metric_smmry <- NULL # or function(data, lev=NULL, model=NULL) {
# confusion_mtrx <- t(as.matrix(confusionMatrix(data$pred, data$obs)))
# #print(confusion_mtrx)
# #print(confusion_mtrx * glb_model_metric_terms)
# metric <- sum(confusion_mtrx * glb_model_metric_terms) / nrow(data)
# names(metric) <- glb_model_metric
# return(metric)
# }
# Tuning grid: the "dummy" parameter row is a placeholder the template's
# fitting helpers presumably ignore for methods with no tunables -- verify.
glb_tune_models_df <-
rbind(
#data.frame(parameter="cp", min=0.00005, max=0.00005, by=0.000005),
#seq(from=0.01, to=0.01, by=0.01)
#data.frame(parameter="mtry", min=080, max=100, by=10),
#data.frame(parameter="mtry", min=08, max=10, by=1),
data.frame(parameter="dummy", min=2, max=4, by=1)
)
# or NULL
glb_n_cv_folds <- 3 # or NULL
glb_clf_proba_threshold <- NULL # 0.5
# Model selection criteria
# Regression picks by out-of-bag RMSE first; classification by OOB accuracy.
if (glb_is_regression)
glb_model_evl_criteria <- c("min.RMSE.OOB", "max.R.sq.OOB", "max.Adj.R.sq.fit")
if (glb_is_classification) {
if (glb_is_binomial)
glb_model_evl_criteria <-
c("max.Accuracy.OOB", "max.auc.OOB", "max.Kappa.OOB", "min.aic.fit") else
glb_model_evl_criteria <- c("max.Accuracy.OOB", "max.Kappa.OOB")
}
glb_sel_mdl_id <- NULL # or "<model_id_prefix>.<model_method>"
glb_fin_mdl_id <- glb_sel_mdl_id # or "Final"
# Depict process
# Build a Petri net (via mypetrinet.R's petrinet()) that visualizes the
# pipeline: data flows from bgn through model selection/fitting to the
# final predictions; M0 puts 3 tokens at "bgn".
glb_analytics_pn <- petrinet(name="glb_analytics_pn",
trans_df=data.frame(id=1:6,
name=c("data.training.all","data.new",
"model.selected","model.final",
"data.training.all.prediction","data.new.prediction"),
x=c( -5,-5,-15,-25,-25,-35),
y=c( -5, 5, 0, 0, -5, 5)
),
places_df=data.frame(id=1:4,
name=c("bgn","fit.data.training.all","predict.data.new","end"),
x=c( -0, -20, -30, -40),
y=c( 0, 0, 0, 0),
M0=c( 3, 0, 0, 0)
),
arcs_df=data.frame(
begin=c("bgn","bgn","bgn",
"data.training.all","model.selected","fit.data.training.all",
"fit.data.training.all","model.final",
"data.new","predict.data.new",
"data.training.all.prediction","data.new.prediction"),
end =c("data.training.all","data.new","model.selected",
"fit.data.training.all","fit.data.training.all","model.final",
"data.training.all.prediction","predict.data.new",
"predict.data.new","data.new.prediction",
"end","end")
))
#print(ggplot.petrinet(glb_analytics_pn))
print(ggplot.petrinet(glb_analytics_pn) + coord_flip())
## Loading required package: grid
glb_analytics_avl_objs <- NULL
# Start the elapsed-time bookkeeping and import the training csv.
glb_chunks_df <- myadd_chunk(NULL, "import.data")
## label step_major step_minor bgn end elapsed
## 1 import.data 1 0 9.5 NA NA
1.0: import data#glb_chunks_df <- myadd_chunk(NULL, "import.data")
glb_trnobs_df <- myimport_data(url=glb_trnng_url, comment="glb_trnobs_df",
force_header=TRUE)
## [1] "Reading file ./data/climate_change.csv..."
## [1] "dimensions of data in ./data/climate_change.csv: 308 rows x 11 cols"
## Year Month MEI CO2 CH4 N2O CFC.11 CFC.12 TSI
## 1 1983 5 2.556 345.96 1638.59 303.677 191.324 350.113 1366.102
## 2 1983 6 2.167 345.52 1633.71 303.746 192.057 351.848 1366.121
## 3 1983 7 1.741 344.15 1633.22 303.795 192.818 353.725 1366.285
## 4 1983 8 1.130 342.25 1631.35 303.839 193.602 355.633 1366.420
## 5 1983 9 0.428 340.17 1648.40 303.901 194.392 357.465 1366.234
## 6 1983 10 0.002 340.30 1663.79 303.970 195.171 359.174 1366.059
## Aerosols Temp
## 1 0.0863 0.109
## 2 0.0794 0.118
## 3 0.0731 0.137
## 4 0.0673 0.176
## 5 0.0619 0.149
## 6 0.0569 0.093
## Year Month MEI CO2 CH4 N2O CFC.11 CFC.12 TSI
## 11 1984 3 0.131 345.46 1655.77 304.285 198.249 365.044 1366.170
## 47 1987 3 1.724 349.72 1695.08 305.812 231.325 416.063 1365.669
## 157 1996 5 -0.178 365.16 1754.78 311.533 269.456 528.449 1365.624
## 223 2001 11 -0.180 369.69 1793.55 316.729 260.434 543.226 1366.520
## 224 2001 12 0.003 371.18 1796.58 316.883 260.432 543.419 1366.891
## 303 2008 7 0.003 386.42 1782.93 321.372 244.434 535.026 1365.672
## Aerosols Temp
## 11 0.0383 0.049
## 47 0.0109 0.021
## 157 0.0067 0.177
## 223 0.0021 0.491
## 224 0.0022 0.323
## 303 0.0033 0.406
## Year Month MEI CO2 CH4 N2O CFC.11 CFC.12 TSI
## 303 2008 7 0.003 386.42 1782.93 321.372 244.434 535.026 1365.672
## 304 2008 8 -0.266 384.15 1779.88 321.405 244.200 535.072 1365.657
## 305 2008 9 -0.643 383.09 1795.08 321.529 244.083 535.048 1365.665
## 306 2008 10 -0.780 382.99 1814.18 321.796 244.080 534.927 1365.676
## 307 2008 11 -0.621 384.13 1812.37 322.013 244.225 534.906 1365.707
## 308 2008 12 -0.666 385.56 1812.88 322.182 244.204 535.005 1365.693
## Aerosols Temp
## 303 0.0033 0.406
## 304 0.0036 0.407
## 305 0.0043 0.378
## 306 0.0046 0.440
## 307 0.0048 0.394
## 308 0.0046 0.330
## 'data.frame': 308 obs. of 11 variables:
## $ Year : int 1983 1983 1983 1983 1983 1983 1983 1983 1984 1984 ...
## $ Month : int 5 6 7 8 9 10 11 12 1 2 ...
## $ MEI : num 2.556 2.167 1.741 1.13 0.428 ...
## $ CO2 : num 346 346 344 342 340 ...
## $ CH4 : num 1639 1634 1633 1631 1648 ...
## $ N2O : num 304 304 304 304 304 ...
## $ CFC.11 : num 191 192 193 194 194 ...
## $ CFC.12 : num 350 352 354 356 357 ...
## $ TSI : num 1366 1366 1366 1366 1366 ...
## $ Aerosols: num 0.0863 0.0794 0.0731 0.0673 0.0619 0.0569 0.0524 0.0486 0.0451 0.0416 ...
## $ Temp : num 0.109 0.118 0.137 0.176 0.149 0.093 0.232 0.078 0.089 0.013 ...
## - attr(*, "comment")= chr "glb_trnobs_df"
## NULL
# glb_trnobs_df <- read.delim("data/hygiene.txt", header=TRUE, fill=TRUE, sep="\t",
# fileEncoding='iso-8859-1')
# glb_trnobs_df <- read.table("data/hygiene.dat.labels", col.names=c("dirty"),
# na.strings="[none]")
# glb_trnobs_df$review <- readLines("data/hygiene.dat", n =-1)
# comment(glb_trnobs_df) <- "glb_trnobs_df"
# glb_trnobs_df <- data.frame()
# for (symbol in c("Boeing", "CocaCola", "GE", "IBM", "ProcterGamble")) {
# sym_trnobs_df <-
# myimport_data(url=gsub("IBM", symbol, glb_trnng_url), comment="glb_trnobs_df",
# force_header=TRUE)
# sym_trnobs_df$Symbol <- symbol
# glb_trnobs_df <- myrbind_df(glb_trnobs_df, sym_trnobs_df)
# }
# glb_trnobs_df <-
# glb_trnobs_df %>% dplyr::filter(Year >= 1999)
# Partition the observations into training (glb_trnobs_df) and "new"/test
# (glb_newobs_df) sets. With the current settings this takes the
# "condition" branch: rows with Year > 2006 become the test set.
if (glb_is_separate_newobs_dataset) {
glb_newobs_df <- myimport_data(url=glb_newdt_url, comment="glb_newobs_df",
force_header=TRUE)
# To make plots / stats / checks easier in chunk:inspectORexplore.data
glb_allobs_df <- myrbind_df(glb_trnobs_df, glb_newobs_df);
comment(glb_allobs_df) <- "glb_allobs_df"
} else {
glb_allobs_df <- glb_trnobs_df; comment(glb_allobs_df) <- "glb_allobs_df"
if (!glb_split_entity_newobs_datasets) {
stop("Not implemented yet")
glb_newobs_df <- glb_trnobs_df[sample(1:nrow(glb_trnobs_df),
max(2, nrow(glb_trnobs_df) / 1000)),]
} else if (glb_split_newdata_method == "condition") {
# Split by evaluating the condition string against the training rows;
# the complement (!(condition)) stays as training data.
glb_newobs_df <- do.call("subset",
list(glb_trnobs_df, parse(text=glb_split_newdata_condition)))
glb_trnobs_df <- do.call("subset",
list(glb_trnobs_df, parse(text=paste0("!(",
glb_split_newdata_condition,
")"))))
} else if (glb_split_newdata_method == "sample") {
# Stratified random split on the raw response via caTools::sample.split.
require(caTools)
set.seed(glb_split_sample.seed)
split <- sample.split(glb_trnobs_df[, glb_rsp_var_raw],
SplitRatio=(1-glb_split_newdata_size_ratio))
glb_newobs_df <- glb_trnobs_df[!split, ]
glb_trnobs_df <- glb_trnobs_df[split ,]
} else if (glb_split_newdata_method == "copy") {
glb_trnobs_df <- glb_allobs_df
comment(glb_trnobs_df) <- "glb_trnobs_df"
glb_newobs_df <- glb_allobs_df
comment(glb_newobs_df) <- "glb_newobs_df"
} else stop("glb_split_newdata_method should be %in% c('condition', 'sample', 'copy')")
comment(glb_newobs_df) <- "glb_newobs_df"
myprint_df(glb_newobs_df)
str(glb_newobs_df)
if (glb_split_entity_newobs_datasets) {
myprint_df(glb_trnobs_df)
str(glb_trnobs_df)
}
}
## Year Month MEI CO2 CH4 N2O CFC.11 CFC.12 TSI
## 285 2007 1 0.974 382.93 1799.66 320.561 248.372 539.206 1365.717
## 286 2007 2 0.510 383.81 1803.08 320.571 248.264 538.973 1365.715
## 287 2007 3 0.074 384.56 1803.10 320.548 247.997 538.811 1365.754
## 288 2007 4 -0.049 386.40 1802.11 320.518 247.574 538.586 1365.723
## 289 2007 5 0.183 386.58 1795.65 320.445 247.224 538.130 1365.693
## 290 2007 6 -0.358 386.05 1781.81 320.332 246.881 537.376 1365.762
## Aerosols Temp
## 285 0.0054 0.601
## 286 0.0051 0.498
## 287 0.0045 0.435
## 288 0.0045 0.466
## 289 0.0041 0.372
## 290 0.0040 0.382
## Year Month MEI CO2 CH4 N2O CFC.11 CFC.12 TSI
## 285 2007 1 0.974 382.93 1799.66 320.561 248.372 539.206 1365.717
## 288 2007 4 -0.049 386.40 1802.11 320.518 247.574 538.586 1365.723
## 293 2007 9 -1.162 380.90 1794.21 320.618 246.214 537.281 1365.716
## 295 2007 11 -1.177 382.42 1803.79 321.062 246.178 537.319 1365.668
## 305 2008 9 -0.643 383.09 1795.08 321.529 244.083 535.048 1365.665
## 307 2008 11 -0.621 384.13 1812.37 322.013 244.225 534.906 1365.707
## Aerosols Temp
## 285 0.0054 0.601
## 288 0.0045 0.466
## 293 0.0042 0.402
## 295 0.0042 0.266
## 305 0.0043 0.378
## 307 0.0048 0.394
## Year Month MEI CO2 CH4 N2O CFC.11 CFC.12 TSI
## 303 2008 7 0.003 386.42 1782.93 321.372 244.434 535.026 1365.672
## 304 2008 8 -0.266 384.15 1779.88 321.405 244.200 535.072 1365.657
## 305 2008 9 -0.643 383.09 1795.08 321.529 244.083 535.048 1365.665
## 306 2008 10 -0.780 382.99 1814.18 321.796 244.080 534.927 1365.676
## 307 2008 11 -0.621 384.13 1812.37 322.013 244.225 534.906 1365.707
## 308 2008 12 -0.666 385.56 1812.88 322.182 244.204 535.005 1365.693
## Aerosols Temp
## 303 0.0033 0.406
## 304 0.0036 0.407
## 305 0.0043 0.378
## 306 0.0046 0.440
## 307 0.0048 0.394
## 308 0.0046 0.330
## 'data.frame': 24 obs. of 11 variables:
## $ Year : int 2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...
## $ Month : int 1 2 3 4 5 6 7 8 9 10 ...
## $ MEI : num 0.974 0.51 0.074 -0.049 0.183 ...
## $ CO2 : num 383 384 385 386 387 ...
## $ CH4 : num 1800 1803 1803 1802 1796 ...
## $ N2O : num 321 321 321 321 320 ...
## $ CFC.11 : num 248 248 248 248 247 ...
## $ CFC.12 : num 539 539 539 539 538 ...
## $ TSI : num 1366 1366 1366 1366 1366 ...
## $ Aerosols: num 0.0054 0.0051 0.0045 0.0045 0.0041 0.004 0.004 0.0041 0.0042 0.0041 ...
## $ Temp : num 0.601 0.498 0.435 0.466 0.372 0.382 0.394 0.358 0.402 0.362 ...
## - attr(*, "comment")= chr "glb_newobs_df"
## Year Month MEI CO2 CH4 N2O CFC.11 CFC.12 TSI
## 1 1983 5 2.556 345.96 1638.59 303.677 191.324 350.113 1366.102
## 2 1983 6 2.167 345.52 1633.71 303.746 192.057 351.848 1366.121
## 3 1983 7 1.741 344.15 1633.22 303.795 192.818 353.725 1366.285
## 4 1983 8 1.130 342.25 1631.35 303.839 193.602 355.633 1366.420
## 5 1983 9 0.428 340.17 1648.40 303.901 194.392 357.465 1366.234
## 6 1983 10 0.002 340.30 1663.79 303.970 195.171 359.174 1366.059
## Aerosols Temp
## 1 0.0863 0.109
## 2 0.0794 0.118
## 3 0.0731 0.137
## 4 0.0673 0.176
## 5 0.0619 0.149
## 6 0.0569 0.093
## Year Month MEI CO2 CH4 N2O CFC.11 CFC.12 TSI
## 93 1991 1 0.309 354.87 1734.82 309.404 265.709 489.664 1366.053
## 129 1994 1 0.338 358.22 1759.84 309.951 271.238 514.991 1365.737
## 180 1998 4 2.673 368.61 1780.30 313.240 266.054 535.409 1366.090
## 199 1999 11 -1.079 366.68 1796.88 315.006 264.266 540.312 1366.352
## 271 2005 11 -0.407 378.29 1803.68 319.525 251.048 541.756 1365.779
## 272 2005 12 -0.585 379.92 1798.89 319.688 250.989 541.799 1365.855
## Aerosols Temp
## 93 0.0060 0.224
## 129 0.0285 0.160
## 180 0.0031 0.608
## 199 0.0021 0.223
## 271 0.0037 0.478
## 272 0.0036 0.366
## Year Month MEI CO2 CH4 N2O CFC.11 CFC.12 TSI
## 279 2006 7 0.628 382.38 1765.95 319.872 249.247 539.725 1365.821
## 280 2006 8 0.759 380.45 1762.66 319.930 248.981 539.682 1365.707
## 281 2006 9 0.793 378.92 1776.04 320.010 248.775 539.566 1365.842
## 282 2006 10 0.892 379.16 1789.02 320.125 248.666 539.488 1365.827
## 283 2006 11 1.292 380.18 1791.91 320.321 248.605 539.500 1365.704
## 284 2006 12 0.951 381.79 1795.04 320.451 248.480 539.377 1365.709
## Aerosols Temp
## 279 0.0038 0.456
## 280 0.0041 0.482
## 281 0.0043 0.425
## 282 0.0044 0.472
## 283 0.0049 0.440
## 284 0.0054 0.518
## 'data.frame': 284 obs. of 11 variables:
## $ Year : int 1983 1983 1983 1983 1983 1983 1983 1983 1984 1984 ...
## $ Month : int 5 6 7 8 9 10 11 12 1 2 ...
## $ MEI : num 2.556 2.167 1.741 1.13 0.428 ...
## $ CO2 : num 346 346 344 342 340 ...
## $ CH4 : num 1639 1634 1633 1631 1648 ...
## $ N2O : num 304 304 304 304 304 ...
## $ CFC.11 : num 191 192 193 194 194 ...
## $ CFC.12 : num 350 352 354 356 357 ...
## $ TSI : num 1366 1366 1366 1366 1366 ...
## $ Aerosols: num 0.0863 0.0794 0.0731 0.0673 0.0619 0.0569 0.0524 0.0486 0.0451 0.0416 ...
## $ Temp : num 0.109 0.118 0.137 0.176 0.149 0.093 0.232 0.078 0.089 0.013 ...
# Sanity checks on the split, then recombine both partitions into
# glb_allobs_df (tagged via .src) for uniform feature engineering.
if ((num_nas <- sum(is.na(glb_trnobs_df[, glb_rsp_var_raw]))) > 0)
stop("glb_trnobs_df$", glb_rsp_var_raw, " contains NAs for ", num_nas, " obs")
if (nrow(glb_trnobs_df) == nrow(glb_allobs_df))
warning("glb_trnobs_df same as glb_allobs_df")
if (nrow(glb_newobs_df) == nrow(glb_allobs_df))
warning("glb_newobs_df same as glb_allobs_df")
if (length(glb_drop_vars) > 0) {
warning("dropping vars: ", paste0(glb_drop_vars, collapse=", "))
glb_allobs_df <- glb_allobs_df[, setdiff(names(glb_allobs_df), glb_drop_vars)]
glb_trnobs_df <- glb_trnobs_df[, setdiff(names(glb_trnobs_df), glb_drop_vars)]
glb_newobs_df <- glb_newobs_df[, setdiff(names(glb_newobs_df), glb_drop_vars)]
}
#stop(here"); sav_allobs_df <- glb_allobs_df # glb_allobs_df <- sav_allobs_df
# Check for duplicates in glb_id_var
# No id column configured, so fall back to row names as the identifier.
if (length(glb_id_var) == 0) {
warning("using .rownames as identifiers for observations")
glb_allobs_df$.rownames <- rownames(glb_allobs_df)
glb_trnobs_df$.rownames <- rownames(glb_trnobs_df)
glb_newobs_df$.rownames <- rownames(glb_newobs_df)
glb_id_var <- ".rownames"
}
## Warning: using .rownames as identifiers for observations
if (sum(duplicated(glb_allobs_df[, glb_id_var, FALSE])) > 0)
stop(glb_id_var, " duplicated in glb_allobs_df")
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, glb_id_var)
# Combine trnent & newobs into glb_allobs_df for easier manipulation
glb_trnobs_df$.src <- "Train"; glb_newobs_df$.src <- "Test";
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, ".src")
glb_allobs_df <- myrbind_df(glb_trnobs_df, glb_newobs_df)
comment(glb_allobs_df) <- "glb_allobs_df"
# Re-sort by the id column; orderBy comes from the doBy package
# (loaded by the sourced utilities -- TODO confirm).
glb_allobs_df <- orderBy(reformulate(glb_id_var), glb_allobs_df)
# The partitions are now carried inside glb_allobs_df via .src; free the
# stand-alone copies.
glb_trnobs_df <- glb_newobs_df <- NULL
glb_chunks_df <- myadd_chunk(glb_chunks_df, "inspect.data", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 1 import.data 1 0 9.500 9.896 0.396
## 2 inspect.data 2 0 9.896 NA NA
2.0: inspect data#print(str(glb_allobs_df))
#View(glb_allobs_df)
# Display the distribution of `var` broken out by data-source partition
# (Train vs Test rows of the global glb_allobs_df): first the raw
# cross-tabulated counts, then the row-wise proportions. Uses the helper
# mycreate_xtab_df() sourced from mydsutils.R.
dsp_class_dstrb <- function(var) {
  cnts_df <- mycreate_xtab_df(glb_allobs_df, c(".src", var))
  rownames(cnts_df) <- cnts_df$.src
  cnts_df <- subset(cnts_df, select=-.src)
  print(cnts_df)
  print(cnts_df / rowSums(cnts_df, na.rm=TRUE))
}
# Performed repeatedly in other chunks
# Sanity-check the combined observations: histogram of the raw response
# faceted by partition (.src), class distribution for classification
# problems, and the generic data-problem scan from mydsutils.R.
glb_chk_data <- function() {
  print(myplot_histogram(glb_allobs_df, glb_rsp_var_raw) +
          facet_wrap(~ .src))
  if (glb_is_classification) {
    # Prefer the mapped response once it exists; fall back to the raw one.
    dstrb_var <- ifelse(glb_rsp_var %in% names(glb_allobs_df),
                        glb_rsp_var, glb_rsp_var_raw)
    dsp_class_dstrb(var=dstrb_var)
  }
  mycheck_problem_data(glb_allobs_df)
}
glb_chk_data()
## [1] "numeric data missing in glb_allobs_df: "
## named integer(0)
## [1] "numeric data w/ 0s in glb_allobs_df: "
## named integer(0)
## [1] "numeric data w/ Infs in glb_allobs_df: "
## named integer(0)
## [1] "numeric data w/ NaNs in glb_allobs_df: "
## named integer(0)
## [1] "string data missing in glb_allobs_df: "
## .rownames
## 0
# Create new features that help diagnostics
# If a raw->factor response mapping is configured, materialize the mapped
# response column and verify the mapping. Skipped here (mapping is NULL,
# since this is a regression on the raw Temp column).
if (!is.null(glb_map_rsp_raw_to_var)) {
glb_allobs_df[, glb_rsp_var] <-
glb_map_rsp_raw_to_var(glb_allobs_df[, glb_rsp_var_raw])
mycheck_map_results(mapd_df=glb_allobs_df,
from_col_name=glb_rsp_var_raw, to_col_name=glb_rsp_var)
if (glb_is_classification) dsp_class_dstrb(glb_rsp_var)
}
# Convert dates to numbers
# typically, dates come in as chars;
# so this must be done before converting chars to factors
#
# Expand each date column listed in `vars` into calendar features: POSIX
# value, year/month/day-of-month/julian date (numeric + factor bins),
# weekday, weekend/holiday flags, hour/minute/second bins, and orthogonal
# polynomials of minutes-since-midnight when that value varies.
# Parsing uses the per-column format/timezone registered in the globals
# glb_date_fmts / glb_date_tzs.
#
# Args:
#   df:      data.frame holding the raw date strings
#   vars:    character vector of date column names to expand
#   id_vars: id column(s) carried along while building (for debugging)
#   rsp_var: response column carried along while building (for plots)
# Returns: data.frame with the derived feature columns for ALL vars.
#          (Fix: the previous version overwrote dates_df/keep_feats each
#          iteration, silently returning only the LAST var's features.)
# Stops:  if any date string fails to parse; the offending rows are printed
#         first. (Fix: print() used to come after stop(), i.e. never ran.)
myextract_dates_df <- function(df, vars, id_vars, rsp_var) {
    all_feats_df <- NULL
    for (var in vars) {
        dates_df <- df[, id_vars, FALSE]
        dates_df[, rsp_var] <- df[, rsp_var, FALSE]
        #dates_df <- data.frame(.date=strptime(df[, var], "%Y-%m-%d %H:%M:%S"))
        dates_df <- cbind(dates_df,
            data.frame(.date=strptime(df[, var],
                glb_date_fmts[[var]], tz=glb_date_tzs[[var]])))
        if (sum(is.na(dates_df$.date)) > 0) {
            # Show the unparseable rows before aborting so the failure is
            # diagnosable.
            print(df[is.na(dates_df$.date), c(id_vars, rsp_var, var)])
            stop("NA POSIX dates for ", var)
        }
        .date <- dates_df$.date
        dates_df[, paste0(var, ".POSIX")] <- .date
        dates_df[, paste0(var, ".year")] <- as.numeric(format(.date, "%Y"))
        dates_df[, paste0(var, ".year.fctr")] <- as.factor(format(.date, "%Y"))
        dates_df[, paste0(var, ".month")] <- as.numeric(format(.date, "%m"))
        dates_df[, paste0(var, ".month.fctr")] <- as.factor(format(.date, "%m"))
        dates_df[, paste0(var, ".date")] <- as.numeric(format(.date, "%d"))
        dates_df[, paste0(var, ".date.fctr")] <-
            cut(as.numeric(format(.date, "%d")), 5) # by month week
        dates_df[, paste0(var, ".juliandate")] <- as.numeric(format(.date, "%j"))
        # wkday Sun=0; Mon=1; ...; Sat=6
        dates_df[, paste0(var, ".wkday")] <- as.numeric(format(.date, "%w"))
        dates_df[, paste0(var, ".wkday.fctr")] <- as.factor(format(.date, "%w"))
        # Get US Federal Holidays, scraped from the USPS 2012 calendar page.
        # NOTE(review): only 2012 holidays are fetched, so .hlday is 0 for
        # dates in any other year -- extend if the data spans other years.
        require(XML)
        doc.html = htmlTreeParse('http://about.usps.com/news/events-calendar/2012-federal-holidays.htm', useInternal = TRUE)
        # parse the tree by tables
        txt <- unlist(strsplit(xpathSApply(doc.html, "//*/table", xmlValue), "\n"))
        # keep the "<Week>day, <Month> <day>" rows and strip the weekday prefix
        txt <- grep("day, ", txt, value=TRUE)
        txt <- trimws(gsub("(.*?)day, (.*)", "\\2", txt))
        hldays <- strptime(paste(txt, ", 2012", sep=""), "%B %e, %Y")
        dates_df[, paste0(var, ".hlday")] <-
            ifelse(format(.date, "%Y-%m-%d") %in% hldays, 1, 0)
        # weekend = Saturday/Sunday or a federal holiday
        dates_df[, paste0(var, ".wkend")] <- as.numeric(
            (dates_df[, paste0(var, ".wkday")] %in% c(0, 6)) |
            dates_df[, paste0(var, ".hlday")] )
        dates_df[, paste0(var, ".hour")] <- as.numeric(format(.date, "%H"))
        dates_df[, paste0(var, ".hour.fctr")] <-
            if (length(unique(vals <- as.numeric(format(.date, "%H")))) <= 1)
                vals else cut(vals, 3) # by work-shift
        dates_df[, paste0(var, ".minute")] <- as.numeric(format(.date, "%M"))
        dates_df[, paste0(var, ".minute.fctr")] <-
            if (length(unique(vals <- as.numeric(format(.date, "%M")))) <= 1)
                vals else cut(vals, 4) # by quarter-hours
        dates_df[, paste0(var, ".second")] <- as.numeric(format(.date, "%S"))
        dates_df[, paste0(var, ".second.fctr")] <-
            if (length(unique(vals <- as.numeric(format(.date, "%S")))) <= 1)
                vals else cut(vals, 4) # by quarter-minutes
        dates_df[, paste0(var, ".day.minutes")] <-
            60 * dates_df[, paste0(var, ".hour")] +
            dates_df[, paste0(var, ".minute")]
        if ((unq_vals_n <- length(unique(dates_df[, paste0(var, ".day.minutes")]))) > 1) {
            # poly() requires degree < number of unique points, so cap at
            # unq_vals_n - 1 (the previous min(unq_vals_n, 5) errored when
            # 2 <= unq_vals_n <= 5).
            max_degree <- min(unq_vals_n - 1, 5)
            dates_df[, paste0(var, ".day.minutes.poly.", 1:max_degree)] <-
                as.matrix(poly(dates_df[, paste0(var, ".day.minutes")], max_degree))
        } else max_degree <- 0
        keep_feats <- paste(var,
            c(".POSIX", ".year.fctr", ".month.fctr", ".date.fctr", ".wkday.fctr",
              ".wkend", ".hour.fctr", ".minute.fctr", ".second.fctr"), sep="")
        if (max_degree > 0)
            keep_feats <- union(keep_feats, paste(var,
                paste0(".day.minutes.poly.", 1:max_degree), sep=""))
        keep_feats <- intersect(keep_feats, names(dates_df))
        # Accumulate this var's features instead of overwriting previous vars'
        all_feats_df <- if (is.null(all_feats_df))
            dates_df[, keep_feats, drop=FALSE] else
            cbind(all_feats_df, dates_df[, keep_feats, drop=FALSE])
    }
    #myprint_df(all_feats_df)
    return(all_feats_df)
}
# Date-variable feature engineering: runs only when glb_date_vars is set
# (NULL for this climate-change dataset, so this whole block is a no-op here).
if (!is.null(glb_date_vars)) {
# Extract calendar components (.POSIX, .year.fctr, .wkend, ...) and bind them on
glb_allobs_df <- cbind(glb_allobs_df,
myextract_dates_df(df=glb_allobs_df, vars=glb_date_vars,
id_vars=glb_id_var, rsp_var=glb_rsp_var))
# Exclude the raw date string and its .POSIX twin from the model feature set
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features,
paste(glb_date_vars, c("", ".POSIX"), sep=""))
for (feat in glb_date_vars) {
# Sort chronologically so the lag-based gap features below are meaningful
glb_allobs_df <- orderBy(reformulate(paste0(feat, ".POSIX")), glb_allobs_df)
# print(myplot_scatter(glb_allobs_df, xcol_name=paste0(feat, ".POSIX"),
# ycol_name=glb_rsp_var, colorcol_name=glb_rsp_var))
# NOTE(review): hard-coded "2012-12-01" window looks dataset-specific
# (NYTimes blog template) -- confirm before reusing with other data
print(myplot_scatter(glb_allobs_df[glb_allobs_df[, paste0(feat, ".POSIX")] >=
strptime("2012-12-01", "%Y-%m-%d"), ],
xcol_name=paste0(feat, ".POSIX"),
ycol_name=glb_rsp_var, colorcol_name=paste0(feat, ".wkend")))
# Create features that measure the gap between previous timestamp in the data
require(zoo)
# zoo series of the timestamps as numeric seconds, in chronological order
z <- zoo(as.numeric(as.POSIXlt(glb_allobs_df[, paste0(feat, ".POSIX")])))
glb_allobs_df[, paste0(feat, ".zoo")] <- z
print(head(glb_allobs_df[, c(glb_id_var, feat, paste0(feat, ".zoo"))]))
print(myplot_scatter(glb_allobs_df[glb_allobs_df[, paste0(feat, ".POSIX")] >
strptime("2012-10-01", "%Y-%m-%d"), ],
xcol_name=paste0(feat, ".zoo"), ycol_name=glb_rsp_var,
colorcol_name=glb_rsp_var))
# Empty zoo object indexed like the rows; merging the lagged differences
# against it pads them back to full length (NA where no lag exists)
b <- zoo(, seq(nrow(glb_allobs_df)))
# Gap (seconds) to the previous observation; NA gaps (first rows) -> 0
last1 <- as.numeric(merge(z-lag(z, -1), b, all=TRUE)); last1[is.na(last1)] <- 0
glb_allobs_df[, paste0(feat, ".last1.log")] <- log(1 + last1)
print(gp <- myplot_box(df=glb_allobs_df[glb_allobs_df[,
paste0(feat, ".last1.log")] > 0, ],
ycol_names=paste0(feat, ".last1.log"),
xcol_name=glb_rsp_var))
# Gap to the 10th-previous observation
last10 <- as.numeric(merge(z-lag(z, -10), b, all=TRUE)); last10[is.na(last10)] <- 0
glb_allobs_df[, paste0(feat, ".last10.log")] <- log(1 + last10)
print(gp <- myplot_box(df=glb_allobs_df[glb_allobs_df[,
paste0(feat, ".last10.log")] > 0, ],
ycol_names=paste0(feat, ".last10.log"),
xcol_name=glb_rsp_var))
# Gap to the 100th-previous observation
last100 <- as.numeric(merge(z-lag(z, -100), b, all=TRUE)); last100[is.na(last100)] <- 0
glb_allobs_df[, paste0(feat, ".last100.log")] <- log(1 + last100)
print(gp <- myplot_box(df=glb_allobs_df[glb_allobs_df[,
paste0(feat, ".last100.log")] > 0, ],
ycol_names=paste0(feat, ".last100.log"),
xcol_name=glb_rsp_var))
# Restore the original (id) row order after the chronological sort
glb_allobs_df <- orderBy(reformulate(glb_id_var), glb_allobs_df)
# The raw .zoo timestamp is an intermediate, not a model feature
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features,
c(paste0(feat, ".zoo")))
# all2$last3 = as.numeric(merge(z-lag(z, -3), b, all = TRUE))
# all2$last5 = as.numeric(merge(z-lag(z, -5), b, all = TRUE))
# all2$last10 = as.numeric(merge(z-lag(z, -10), b, all = TRUE))
# all2$last20 = as.numeric(merge(z-lag(z, -20), b, all = TRUE))
# all2$last50 = as.numeric(merge(z-lag(z, -50), b, all = TRUE))
#
#
# # order table
# all2 = all2[order(all2$id),]
#
# ## fill in NAs
# # count averages
# na.avg = all2 %>% group_by(weekend, hour) %>% dplyr::summarise(
# last1=mean(last1, na.rm=TRUE),
# last3=mean(last3, na.rm=TRUE),
# last5=mean(last5, na.rm=TRUE),
# last10=mean(last10, na.rm=TRUE),
# last20=mean(last20, na.rm=TRUE),
# last50=mean(last50, na.rm=TRUE)
# )
#
# # fill in averages
# na.merge = merge(all2, na.avg, by=c("weekend","hour"))
# na.merge = na.merge[order(na.merge$id),]
# for(i in c("last1", "last3", "last5", "last10", "last20", "last50")) {
# y = paste0(i, ".y")
# idx = is.na(all2[[i]])
# all2[idx,][[i]] <- na.merge[idx,][[y]]
# }
# rm(na.avg, na.merge, b, i, idx, n, pd, sec, sh, y, z)
}
}
# check distribution of all numeric data
# Plot the distribution of each numeric feature against the response:
# scatter + smoother for regression, box plot for classification.
# Factor features additionally get a facet per level.
#
# feats_vctr: character vector of column names in glb_allobs_df.
# Returns NULL invisibly; called for its printed plots.
dsp_numeric_feats_dstrb <- function(feats_vctr) {
    for (feat in feats_vctr) {
        print(sprintf("feat: %s", feat))
        gp <- NULL
        if (glb_is_regression)
            gp <- myplot_scatter(df=glb_allobs_df, ycol_name=glb_rsp_var,
                                 xcol_name=feat, smooth=TRUE)
        if (glb_is_classification)
            gp <- myplot_box(df=glb_allobs_df, ycol_names=feat,
                             xcol_name=glb_rsp_var)
        # fix: original printed `gp` even when neither mode flag was TRUE,
        # which errors on an undefined object; skip with a warning instead
        if (is.null(gp)) {
            warning("dsp_numeric_feats_dstrb: neither glb_is_regression nor ",
                    "glb_is_classification is TRUE; skipping plot for ", feat)
            next
        }
        if (inherits(glb_allobs_df[, feat], "factor"))
            gp <- gp + facet_wrap(reformulate(feat))
        print(gp)
    }
}
# dsp_numeric_vars_dstrb(setdiff(names(glb_allobs_df),
# union(myfind_chr_cols_df(glb_allobs_df),
# c(glb_rsp_var_raw, glb_rsp_var))))
# Add diagnostic / engineered features to obs_df.
#
# Currently only adds `.rnorm`, a standard-normal noise column used as a
# no-signal baseline feature for model diagnostics.  The (large) commented
# template of per-dataset transformations from the original has been removed;
# reinstate specific transformations here as needed.
#
# obs_df: observations data frame to augment.
# ref_df: kept for interface compatibility (earlier template versions used
#         the combined observations to align factor levels across splits).
# Returns obs_df with the new column(s) appended.
add_new_diag_feats <- function(obs_df, ref_df=glb_allobs_df) {
    require(plyr)  # retained for its attach side effect; later code expects plyr loaded
    # fix: use direct assignment instead of an unqualified mutate() call --
    # dplyr (attached immediately after this in the script) masks plyr::mutate,
    # so the original was sensitive to package attach order.
    obs_df$.rnorm <- rnorm(n=nrow(obs_df))
    # If levels of a factor differ across obs_df & glb_newobs_df, predict.glm
    # fails; align factor levels against ref_df when adding factor features.
    return(obs_df)
}
glb_allobs_df <- add_new_diag_feats(glb_allobs_df)
## Loading required package: plyr
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#stop(here"); sav_allobs_df <- glb_allobs_df # glb_allobs_df <- sav_allobs_df
# Merge some <descriptor>
# glb_allobs_df$<descriptor>.my <- glb_allobs_df$<descriptor>
# glb_allobs_df[grepl("\\bAIRPORT\\b", glb_allobs_df$<descriptor>.my),
# "<descriptor>.my"] <- "AIRPORT"
# glb_allobs_df$<descriptor>.my <-
# plyr::revalue(glb_allobs_df$<descriptor>.my, c(
# "ABANDONED BUILDING" = "OTHER",
# "##" = "##"
# ))
# print(<descriptor>_freq_df <- mycreate_sqlxtab_df(glb_allobs_df, c("<descriptor>.my")))
# # print(dplyr::filter(<descriptor>_freq_df, grepl("(MEDICAL|DENTAL|OFFICE)", <descriptor>.my)))
# # print(dplyr::filter(dplyr::select(glb_allobs_df, -<var.zoo>),
# # grepl("STORE", <descriptor>.my)))
# glb_exclude_vars_as_features <- c(glb_exclude_vars_as_features, "<descriptor>")
# Check distributions of newly transformed / extracted vars
# Enhancement: remove vars that were displayed ealier
dsp_numeric_feats_dstrb(feats_vctr=setdiff(names(glb_allobs_df),
c(myfind_chr_cols_df(glb_allobs_df), glb_rsp_var_raw, glb_rsp_var,
glb_exclude_vars_as_features)))
## [1] "feat: Year"
## [1] "feat: Month"
## [1] "feat: MEI"
## [1] "feat: CO2"
## [1] "feat: CH4"
## [1] "feat: N2O"
## [1] "feat: CFC.11"
## [1] "feat: CFC.12"
## [1] "feat: TSI"
## [1] "feat: Aerosols"
## [1] "feat: .rnorm"
# Convert factors to dummy variables
# Build splines require(splines); bsBasis <- bs(training$age, df=3)
#pairs(subset(glb_trnobs_df, select=-c(col_symbol)))
# Check for glb_newobs_df & glb_trnobs_df features range mismatches
# Other diagnostics:
# print(subset(glb_trnobs_df, <col1_name> == max(glb_trnobs_df$<col1_name>, na.rm=TRUE) &
# <col2_name> <= mean(glb_trnobs_df$<col1_name>, na.rm=TRUE)))
# print(glb_trnobs_df[which.max(glb_trnobs_df$<col_name>),])
# print(<col_name>_freq_glb_trnobs_df <- mycreate_tbl_df(glb_trnobs_df, "<col_name>"))
# print(which.min(table(glb_trnobs_df$<col_name>)))
# print(which.max(table(glb_trnobs_df$<col_name>)))
# print(which.max(table(glb_trnobs_df$<col1_name>, glb_trnobs_df$<col2_name>)[, 2]))
# print(table(glb_trnobs_df$<col1_name>, glb_trnobs_df$<col2_name>))
# print(table(is.na(glb_trnobs_df$<col1_name>), glb_trnobs_df$<col2_name>))
# print(table(sign(glb_trnobs_df$<col1_name>), glb_trnobs_df$<col2_name>))
# print(mycreate_xtab_df(glb_trnobs_df, <col1_name>))
# print(mycreate_xtab_df(glb_trnobs_df, c(<col1_name>, <col2_name>)))
# print(<col1_name>_<col2_name>_xtab_glb_trnobs_df <-
# mycreate_xtab_df(glb_trnobs_df, c("<col1_name>", "<col2_name>")))
# <col1_name>_<col2_name>_xtab_glb_trnobs_df[is.na(<col1_name>_<col2_name>_xtab_glb_trnobs_df)] <- 0
# print(<col1_name>_<col2_name>_xtab_glb_trnobs_df <-
# mutate(<col1_name>_<col2_name>_xtab_glb_trnobs_df,
# <col3_name>=(<col1_name> * 1.0) / (<col1_name> + <col2_name>)))
# print(mycreate_sqlxtab_df(glb_allobs_df, c("<col1_name>", "<col2_name>")))
# print(<col2_name>_min_entity_arr <-
# sort(tapply(glb_trnobs_df$<col1_name>, glb_trnobs_df$<col2_name>, min, na.rm=TRUE)))
# print(<col1_name>_na_by_<col2_name>_arr <-
# sort(tapply(glb_trnobs_df$<col1_name>.NA, glb_trnobs_df$<col2_name>, mean, na.rm=TRUE)))
# Other plots:
# print(myplot_box(df=glb_trnobs_df, ycol_names="<col1_name>"))
# print(myplot_box(df=glb_trnobs_df, ycol_names="<col1_name>", xcol_name="<col2_name>"))
# print(myplot_line(subset(glb_trnobs_df, Symbol %in% c("CocaCola", "ProcterGamble")),
# "Date.POSIX", "StockPrice", facet_row_colnames="Symbol") +
# geom_vline(xintercept=as.numeric(as.POSIXlt("2003-03-01"))) +
# geom_vline(xintercept=as.numeric(as.POSIXlt("1983-01-01")))
# )
# print(myplot_line(subset(glb_trnobs_df, Date.POSIX > as.POSIXct("2004-01-01")),
# "Date.POSIX", "StockPrice") +
# geom_line(aes(color=Symbol)) +
# coord_cartesian(xlim=c(as.POSIXct("1990-01-01"),
# as.POSIXct("2000-01-01"))) +
# coord_cartesian(ylim=c(0, 250)) +
# geom_vline(xintercept=as.numeric(as.POSIXlt("1997-09-01"))) +
# geom_vline(xintercept=as.numeric(as.POSIXlt("1997-11-01")))
# )
# print(myplot_scatter(glb_allobs_df, "<col1_name>", "<col2_name>", smooth=TRUE))
# print(myplot_scatter(glb_allobs_df, "<col1_name>", "<col2_name>", colorcol_name="<Pred.fctr>") +
# geom_point(data=subset(glb_allobs_df, <condition>),
# mapping=aes(x=<x_var>, y=<y_var>), color="red", shape=4, size=5) +
# geom_vline(xintercept=84))
rm(last1, last10, last100)
## Warning in rm(last1, last10, last100): object 'last1' not found
## Warning in rm(last1, last10, last100): object 'last10' not found
## Warning in rm(last1, last10, last100): object 'last100' not found
glb_chunks_df <- myadd_chunk(glb_chunks_df, "scrub.data", major.inc=FALSE)
## label step_major step_minor bgn end elapsed
## 2 inspect.data 2 0 9.896 14.97 5.074
## 3 scrub.data 2 1 14.971 NA NA
2.1: scrub data

mycheck_problem_data(glb_allobs_df)
## [1] "numeric data missing in glb_allobs_df: "
## named integer(0)
## [1] "numeric data w/ 0s in glb_allobs_df: "
## named integer(0)
## [1] "numeric data w/ Infs in glb_allobs_df: "
## named integer(0)
## [1] "numeric data w/ NaNs in glb_allobs_df: "
## named integer(0)
## [1] "string data missing in glb_allobs_df: "
## .rownames
## 0
# Print frequency tables for the three NYTimes category columns of
# glb_allobs_df (NewsDesk, SectionName, SubsectionName).
dsp_catgs <- function() {
    tbl <- NULL
    for (col in c("NewsDesk", "SectionName", "SubsectionName")) {
        print(paste0(col, ":"))
        tbl <- print(table(glb_allobs_df[[col]]))
    }
    invisible(tbl)
}
# sel_obs <- function(Popular=NULL,
# NewsDesk=NULL, SectionName=NULL, SubsectionName=NULL,
# Headline.contains=NULL, Snippet.contains=NULL, Abstract.contains=NULL,
# Headline.pfx=NULL, NewsDesk.nb=NULL, .clusterid=NULL, myCategory=NULL,
# perl=FALSE) {
# Select observations from glb_allobs_df matching every non-NULL filter.
#
# Filters correspond to columns / derived features; the *.contains arguments
# are regexes applied via grep() (perl= forwarded).  Popular=NA selects the
# rows where Popular is missing (the new/test observations).
# Returns a logical vector aligned with glb_allobs_df rows (TRUE = selected).
#
# fix: restored the real argument list (the stub `function(vars_lst)` did not
# match the body or the callers, which pass named filter arguments).
sel_obs <- function(Popular=NULL,
                    NewsDesk=NULL, SectionName=NULL, SubsectionName=NULL,
                    Headline.contains=NULL, Snippet.contains=NULL,
                    Abstract.contains=NULL,
                    Headline.pfx=NULL, NewsDesk.nb=NULL, .clusterid=NULL,
                    myCategory=NULL,
                    perl=FALSE) {
    tmp_df <- glb_allobs_df
    if (!is.null(Popular)) {
        if (is.na(Popular)) {
            tmp_df <- tmp_df[is.na(tmp_df$Popular), ]
        } else {
            tmp_df <- tmp_df[tmp_df$Popular == Popular, ]
        }
    }
    if (!is.null(NewsDesk))
        tmp_df <- tmp_df[tmp_df$NewsDesk == NewsDesk, ]
    if (!is.null(SectionName))
        tmp_df <- tmp_df[tmp_df$SectionName == SectionName, ]
    if (!is.null(SubsectionName))
        tmp_df <- tmp_df[tmp_df$SubsectionName == SubsectionName, ]
    if (!is.null(Headline.contains))
        tmp_df <- tmp_df[grep(Headline.contains, tmp_df$Headline, perl=perl), ]
    if (!is.null(Snippet.contains))
        tmp_df <- tmp_df[grep(Snippet.contains, tmp_df$Snippet, perl=perl), ]
    if (!is.null(Abstract.contains))
        tmp_df <- tmp_df[grep(Abstract.contains, tmp_df$Abstract, perl=perl), ]
    if (!is.null(Headline.pfx)) {
        if ("Headline.pfx" %in% names(tmp_df)) {
            tmp_df <- tmp_df[tmp_df$Headline.pfx == Headline.pfx, ]
        } else
            warning("glb_allobs_df does not contain Headline.pfx; ignoring that filter")
    }
    if (!is.null(NewsDesk.nb)) {
        if ("NewsDesk.nb" %in% names(tmp_df)) {
            tmp_df <- tmp_df[tmp_df$NewsDesk.nb == NewsDesk.nb, ]
        } else
            warning("glb_allobs_df does not contain NewsDesk.nb; ignoring that filter")
    }
    if (!is.null(.clusterid)) {
        # fix: original compared against non-existent `clusterid` (missing dot)
        if (".clusterid" %in% names(tmp_df)) {
            tmp_df <- tmp_df[tmp_df$.clusterid == .clusterid, ]
        } else
            warning("glb_allobs_df does not contain .clusterid; ignoring that filter")
    }
    if (!is.null(myCategory)) {
        # fix: original condition was inverted (filtered only when the
        # myCategory column was ABSENT, warned when it was present)
        if ("myCategory" %in% names(tmp_df)) {
            tmp_df <- tmp_df[tmp_df$myCategory == myCategory, ]
        } else
            warning("glb_allobs_df does not contain myCategory; ignoring that filter")
    }
    return(glb_allobs_df$UniqueID %in% tmp_df$UniqueID)
}
# Display the observations matching sel_obs(...) filters.
# cols: extra columns to show beyond the defaults; all: print every row
# instead of the head/tail summary produced by myprint_df.
dsp_obs <- function(..., cols=c(NULL), all=FALSE) {
    show_cols <- union(c("UniqueID", "Popular", "myCategory", "Headline"), cols)
    sel_df <- glb_allobs_df[sel_obs(...), show_cols, FALSE]
    if (all) print(sel_df) else myprint_df(sel_df)
}
#dsp_obs(Popular=1, NewsDesk="", SectionName="", Headline.contains="Boehner")
# dsp_obs(Popular=1, NewsDesk="", SectionName="")
# dsp_obs(Popular=NA, NewsDesk="", SectionName="")
# Cross-tabulate NewsDesk x SectionName x SubsectionName x Popular for the
# observations matching the sel_obs(...) filters; NAs shown as a level.
dsp_tbl <- function(...) {
    sel_df <- glb_allobs_df[sel_obs(...), ]
    xtab <- table(sel_df$NewsDesk,
                  sel_df$SectionName,
                  sel_df$SubsectionName,
                  sel_df$Popular, useNA="ifany")
    print(xtab)
}
# Show the Headline-prefix x Headline x response cross-tab for headlines
# matching the regex `str`.
dsp_hdlxtab <- function(str) {
    sel_df <- glb_allobs_df[sel_obs(Headline.contains=str), ]
    print(mycreate_sqlxtab_df(sel_df, c("Headline.pfx", "Headline", glb_rsp_var)))
}
#dsp_hdlxtab("(1914)|(1939)")
# Show the Headline-prefix x category-columns x response cross-tab for
# headlines matching the regex `str`.
dsp_catxtab <- function(str) {
    sel_df <- glb_allobs_df[sel_obs(Headline.contains=str), ]
    print(mycreate_sqlxtab_df(sel_df,
        c("Headline.pfx", "NewsDesk", "SectionName", "SubsectionName", glb_rsp_var)))
}
# dsp_catxtab("1914)|(1939)")
# dsp_catxtab("19(14|39|64):")
# dsp_catxtab("19..:")
# Create myCategory <- NewsDesk#SectionName#SubsectionName
# Fix some data before merging categories
# glb_allobs_df[sel_obs(Headline.contains="Your Turn:", NewsDesk=""),
# "NewsDesk"] <- "Styles"
# glb_allobs_df[sel_obs(Headline.contains="School", NewsDesk="", SectionName="U.S.",
# SubsectionName=""),
# "SubsectionName"] <- "Education"
# glb_allobs_df[sel_obs(Headline.contains="Today in Small Business:", NewsDesk="Business"),
# "SectionName"] <- "Business Day"
# glb_allobs_df[sel_obs(Headline.contains="Today in Small Business:", NewsDesk="Business"),
# "SubsectionName"] <- "Small Business"
# glb_allobs_df[sel_obs(Headline.contains="Readers Respond:"),
# "SectionName"] <- "Opinion"
# glb_allobs_df[sel_obs(Headline.contains="Readers Respond:"),
# "SubsectionName"] <- "Room For Debate"
# glb_allobs_df[sel_obs(NewsDesk="Business", SectionName="", SubsectionName="", Popular=NA),
# "SubsectionName"] <- "Small Business"
# print(glb_allobs_df[glb_allobs_df$UniqueID %in% c(7973),
# c("UniqueID", "Headline", "myCategory", "NewsDesk", "SectionName", "SubsectionName")])
#
# glb_allobs_df[sel_obs(NewsDesk="Business", SectionName="", SubsectionName=""),
# "SectionName"] <- "Technology"
# print(glb_allobs_df[glb_allobs_df$UniqueID %in% c(5076, 5736, 5924, 5911, 6532),
# c("UniqueID", "Headline", "myCategory", "NewsDesk", "SectionName", "SubsectionName")])
#
# glb_allobs_df[sel_obs(SectionName="Health"),
# "NewsDesk"] <- "Science"
# glb_allobs_df[sel_obs(SectionName="Travel"),
# "NewsDesk"] <- "Travel"
#
# glb_allobs_df[sel_obs(SubsectionName="Fashion & Style"),
# "SectionName"] <- ""
# glb_allobs_df[sel_obs(SubsectionName="Fashion & Style"),
# "SubsectionName"] <- ""
# glb_allobs_df[sel_obs(NewsDesk="Styles", SectionName="", SubsectionName="", Popular=1),
# "SectionName"] <- "U.S."
# print(glb_allobs_df[glb_allobs_df$UniqueID %in% c(5486),
# c("UniqueID", "Headline", "myCategory", "NewsDesk", "SectionName", "SubsectionName")])
#
# glb_allobs_df$myCategory <- paste(glb_allobs_df$NewsDesk,
# glb_allobs_df$SectionName,
# glb_allobs_df$SubsectionName,
# sep="#")
# dsp_obs( Headline.contains="Music:"
# #,NewsDesk=""
# #,SectionName=""
# #,SubsectionName="Fashion & Style"
# #,Popular=1 #NA
# ,cols= c("UniqueID", "Headline", "Popular", "myCategory",
# "NewsDesk", "SectionName", "SubsectionName"),
# all=TRUE)
# dsp_obs( Headline.contains="."
# ,NewsDesk=""
# ,SectionName="Opinion"
# ,SubsectionName=""
# #,Popular=1 #NA
# ,cols= c("UniqueID", "Headline", "Popular", "myCategory",
# "NewsDesk", "SectionName", "SubsectionName"),
# all=TRUE)
# Merge some categories
# glb_allobs_df$myCategory <-
# plyr::revalue(glb_allobs_df$myCategory, c(
# "#Business Day#Dealbook" = "Business#Business Day#Dealbook",
# "#Business Day#Small Business" = "Business#Business Day#Small Business",
# "#Crosswords/Games#" = "Business#Crosswords/Games#",
# "Business##" = "Business#Technology#",
# "#Open#" = "Business#Technology#",
# "#Technology#" = "Business#Technology#",
#
# "#Arts#" = "Culture#Arts#",
# "Culture##" = "Culture#Arts#",
#
# "#World#Asia Pacific" = "Foreign#World#Asia Pacific",
# "Foreign##" = "Foreign#World#",
#
# "#N.Y. / Region#" = "Metro#N.Y. / Region#",
#
# "#Opinion#" = "OpEd#Opinion#",
# "OpEd##" = "OpEd#Opinion#",
#
# "#Health#" = "Science#Health#",
# "Science##" = "Science#Health#",
#
# "Styles##" = "Styles##Fashion",
# "Styles#Health#" = "Science#Health#",
# "Styles#Style#Fashion & Style" = "Styles##Fashion",
#
# "#Travel#" = "Travel#Travel#",
#
# "Magazine#Magazine#" = "myOther",
# "National##" = "myOther",
# "National#U.S.#Politics" = "myOther",
# "Sports##" = "myOther",
# "Sports#Sports#" = "myOther",
# "#U.S.#" = "myOther",
#
#
# # "Business##Small Business" = "Business#Business Day#Small Business",
# #
# # "#Opinion#" = "#Opinion#Room For Debate",
# "##" = "##"
# # "Business##" = "Business#Business Day#Dealbook",
# # "Foreign#World#" = "Foreign##",
# # "#Open#" = "Other",
# # "#Opinion#The Public Editor" = "OpEd#Opinion#",
# # "Styles#Health#" = "Styles##",
# # "Styles#Style#Fashion & Style" = "Styles##",
# # "#U.S.#" = "#U.S.#Education",
# ))
# ctgry_xtab_df <- orderBy(reformulate(c("-", ".n")),
# mycreate_sqlxtab_df(glb_allobs_df,
# c("myCategory", "NewsDesk", "SectionName", "SubsectionName", glb_rsp_var)))
# myprint_df(ctgry_xtab_df)
# write.table(ctgry_xtab_df, paste0(glb_out_pfx, "ctgry_xtab.csv"),
# row.names=FALSE)
# ctgry_cast_df <- orderBy(~ -Y -NA, dcast(ctgry_xtab_df,
# myCategory + NewsDesk + SectionName + SubsectionName ~
# Popular.fctr, sum, value.var=".n"))
# myprint_df(ctgry_cast_df)
# write.table(ctgry_cast_df, paste0(glb_out_pfx, "ctgry_cast.csv"),
# row.names=FALSE)
# print(ctgry_sum_tbl <- table(glb_allobs_df$myCategory, glb_allobs_df[, glb_rsp_var],
# useNA="ifany"))
# Chi-squared test of association between membership in the sel_obs(...)
# selection and the Popular outcome, restricted to labeled observations
# (Popular not NA).  Prints the 2-way table and the test result.
dsp_chisq.test <- function(...) {
    labeled_df <- glb_allobs_df[!is.na(glb_allobs_df$Popular), ]
    sel_df <- glb_allobs_df[sel_obs(...) & !is.na(glb_allobs_df$Popular), ]
    sel_df$.marker <- 1
    # left-join the selection marker onto all labeled obs; unselected -> NA -> 0
    mrg_df <- merge(labeled_df[, c(glb_id_var, "Popular")],
                    sel_df[, c(glb_id_var, ".marker")], all.x=TRUE)
    mrg_df[is.na(mrg_df)] <- 0
    print(mrg_tbl <- table(mrg_df$.marker, mrg_df$Popular))
    print("Rows:Selected; Cols:Popular")
    print(chisq.test(mrg_tbl))
}
# dsp_chisq.test(Headline.contains="[Ee]bola")
# dsp_chisq.test(Snippet.contains="[Ee]bola")
# dsp_chisq.test(Abstract.contains="[Ee]bola")
# print(mycreate_sqlxtab_df(glb_allobs_df[sel_obs(Headline.contains="[Ee]bola"), ],
# c(glb_rsp_var, "NewsDesk", "SectionName", "SubsectionName")))
# print(table(glb_allobs_df$NewsDesk, glb_allobs_df$SectionName))
# print(table(glb_allobs_df$SectionName, glb_allobs_df$SubsectionName))
# print(table(glb_allobs_df$NewsDesk, glb_allobs_df$SectionName, glb_allobs_df$SubsectionName))
# glb_allobs_df$myCategory.fctr <- as.factor(glb_allobs_df$myCategory)
# glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features,
# c("myCategory", "NewsDesk", "SectionName", "SubsectionName"))
# Copy Headline into Snipper & Abstract if they are empty
# print(glb_allobs_df[nchar(glb_allobs_df[, "Snippet"]) == 0, c("Headline", "Snippet")])
# print(glb_allobs_df[glb_allobs_df$Headline == glb_allobs_df$Snippet,
# c("UniqueID", "Headline", "Snippet")])
# glb_allobs_df[nchar(glb_allobs_df[, "Snippet"]) == 0, "Snippet"] <-
# glb_allobs_df[nchar(glb_allobs_df[, "Snippet"]) == 0, "Headline"]
#
# print(glb_allobs_df[nchar(glb_allobs_df[, "Abstract"]) == 0, c("Headline", "Abstract")])
# print(glb_allobs_df[glb_allobs_df$Headline == glb_allobs_df$Abstract,
# c("UniqueID", "Headline", "Abstract")])
# glb_allobs_df[nchar(glb_allobs_df[, "Abstract"]) == 0, "Abstract"] <-
# glb_allobs_df[nchar(glb_allobs_df[, "Abstract"]) == 0, "Headline"]
# WordCount_0_df <- subset(glb_allobs_df, WordCount == 0)
# table(WordCount_0_df$Popular, WordCount_0_df$WordCount, useNA="ifany")
# myprint_df(WordCount_0_df[,
# c("UniqueID", "Popular", "WordCount", "Headline")])
2.1: scrub data

glb_chunks_df <- myadd_chunk(glb_chunks_df, "transform.data", major.inc=FALSE)
## label step_major step_minor bgn end elapsed
## 3 scrub.data 2 1 14.971 16.485 1.514
## 4 transform.data 2 2 16.486 NA NA
### Mapping dictionary
#sav_allobs_df <- glb_allobs_df; glb_allobs_df <- sav_allobs_df
# Mapping-dictionary transformations: for each variable in glb_map_vars,
# download its lookup table and replace coded values with mapped ones.
# No-op when glb_map_vars is NULL.
if (!is.null(glb_map_vars)) {
for (feat in glb_map_vars) {
# Lookup table convention: column 1 = join key, column 2 = mapped value
map_df <- myimport_data(url=glb_map_urls[[feat]],
comment="map_df",
print_diagn=TRUE)
glb_allobs_df <- mymap_codes(glb_allobs_df, feat, names(map_df)[2],
map_df, map_join_col_name=names(map_df)[1],
map_tgt_col_name=names(map_df)[2])
}
# Mapped originals are replaced by their mapped versions as features
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, glb_map_vars)
}
### Forced Assignments
#stop(here"); sav_allobs_df <- glb_allobs_df; glb_allobs_df <- sav_allobs_df
# Forced value assignments: for each variable in glb_assign_vars, create a
# <feat>.my copy and overwrite specific from->to value pairs declared in
# glb_assign_pairs_lst.  from=NA means "replace missing values".
# No-op when glb_assign_vars is NULL/empty (for-loop over NULL iterates zero times).
for (feat in glb_assign_vars) {
new_feat <- paste0(feat, ".my")
print(sprintf("Forced Assignments for: %s -> %s...", feat, new_feat))
glb_allobs_df[, new_feat] <- glb_allobs_df[, feat]
# NOTE(review): `pairs` shadows graphics::pairs within this loop body
pairs <- glb_assign_pairs_lst[[feat]]
for (pair_ix in 1:length(pairs$from)) {
# Count affected obs first (for the log line below)
if (is.na(pairs$from[pair_ix]))
nobs <- nrow(filter(glb_allobs_df,
is.na(eval(parse(text=feat),
envir=glb_allobs_df)))) else
nobs <- sum(glb_allobs_df[, feat] == pairs$from[pair_ix])
#nobs <- nrow(filter(glb_allobs_df, is.na(Married.fctr))) ; print(nobs)
# NA -> NA would be a meaningless (and destructive) assignment
if ((is.na(pairs$from[pair_ix])) && (is.na(pairs$to[pair_ix])))
stop("what are you trying to do ???")
if (is.na(pairs$from[pair_ix]))
glb_allobs_df[is.na(glb_allobs_df[, feat]), new_feat] <-
pairs$to[pair_ix] else
glb_allobs_df[glb_allobs_df[, feat] == pairs$from[pair_ix], new_feat] <-
pairs$to[pair_ix]
print(sprintf(" %s -> %s for %s obs",
pairs$from[pair_ix], pairs$to[pair_ix], format(nobs, big.mark=",")))
}
# The raw (pre-assignment) variable is superseded by <feat>.my
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, glb_assign_vars)
}
### Transformations using mapping functions
#stop(here"); sav_allobs_df <- glb_allobs_df; glb_allobs_df <- sav_allobs_df
# Per-variable transformations declared in glb_transform_lst: each entry
# supplies a name suffix (sfx) and a mapping function (mapfn) applied to the
# whole column.  No-op when glb_transform_vars is NULL.
for (feat in glb_transform_vars) {
new_feat <- paste0(feat, glb_transform_lst[[feat]]$sfx)
print(sprintf("Applying mapping function for: %s -> %s...", feat, new_feat))
glb_allobs_df[, new_feat] <- glb_transform_lst[[feat]]$mapfn(glb_allobs_df[, feat])
# NOTE(review): unions the whole glb_transform_vars vector on every pass;
# harmless (union de-dups) but could be hoisted out of the loop
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, glb_transform_vars)
}
### Derivations using mapping functions
#stop(here"); sav_allobs_df <- glb_allobs_df; glb_allobs_df <- sav_allobs_df
# Derived features declared in glb_derive_lst: each entry names its source
# columns (args) and a mapfn; the source columns are gathered into a named
# list and passed via do.call.  No-op when glb_derive_vars is NULL.
for (new_feat in glb_derive_vars) {
print(sprintf("Creating new feature: %s...", new_feat))
args_lst <- NULL
for (arg in glb_derive_lst[[new_feat]]$args)
args_lst[[arg]] <- glb_allobs_df[, arg]
glb_allobs_df[, new_feat] <- do.call(glb_derive_lst[[new_feat]]$mapfn, args_lst)
}
2.2: transform data

glb_chunks_df <- myadd_chunk(glb_chunks_df, "manage.missing.data", major.inc=FALSE)
## label step_major step_minor bgn end elapsed
## 4 transform.data 2 2 16.486 16.518 0.032
## 5 manage.missing.data 2 3 16.518 NA NA
# print(sapply(names(glb_trnobs_df), function(col) sum(is.na(glb_trnobs_df[, col]))))
# print(sapply(names(glb_newobs_df), function(col) sum(is.na(glb_newobs_df[, col]))))
# glb_trnobs_df <- na.omit(glb_trnobs_df)
# glb_newobs_df <- na.omit(glb_newobs_df)
# df[is.na(df)] <- 0
mycheck_problem_data(glb_allobs_df)
## [1] "numeric data missing in glb_allobs_df: "
## named integer(0)
## [1] "numeric data w/ 0s in glb_allobs_df: "
## named integer(0)
## [1] "numeric data w/ Infs in glb_allobs_df: "
## named integer(0)
## [1] "numeric data w/ NaNs in glb_allobs_df: "
## named integer(0)
## [1] "string data missing in glb_allobs_df: "
## .rownames
## 0
# Not refactored into mydsutils.R since glb_*_df might be reassigned
# Impute missing values in glb_allobs_df via mice (excluding the response
# and any excluded features) and return only the columns whose contents
# actually changed, i.e. those that had NAs filled in.
glb_impute_missing_data <- function() {
    require(mice)
    set.seed(glb_mice_complete.seed)
    impute_cols <- setdiff(names(glb_allobs_df),
                           union(glb_exclude_vars_as_features, glb_rsp_var))
    inp_impent_df <- glb_allobs_df[, impute_cols]
    print("Summary before imputation: ")
    print(summary(inp_impent_df))
    out_impent_df <- complete(mice(inp_impent_df))
    print(summary(out_impent_df))
    # complete(mice()) can alter factor attributes even when values are
    # unchanged, so compare column-by-column to find truly modified columns
    changed <- vapply(names(out_impent_df),
                      function(col) !identical(out_impent_df[, col],
                                               inp_impent_df[, col]),
                      logical(1))
    return(out_impent_df[, names(out_impent_df)[changed]])
}
# Run imputation only when requested AND there is missing numeric data AND
# the imputation actually changed at least one column.  Imputed columns are
# appended as <col>.nonNA and the NA-bearing originals are excluded as features.
if (glb_impute_na_data &&
(length(myfind_numerics_missing(glb_allobs_df)) > 0) &&
(ncol(nonna_df <- glb_impute_missing_data()) > 0)) {
for (col in names(nonna_df)) {
glb_allobs_df[, paste0(col, ".nonNA")] <- nonna_df[, col]
glb_exclude_vars_as_features <- c(glb_exclude_vars_as_features, col)
}
}
mycheck_problem_data(glb_allobs_df, terminate = TRUE)
## [1] "numeric data missing in glb_allobs_df: "
## named integer(0)
## [1] "numeric data w/ 0s in glb_allobs_df: "
## named integer(0)
## [1] "numeric data w/ Infs in glb_allobs_df: "
## named integer(0)
## [1] "numeric data w/ NaNs in glb_allobs_df: "
## named integer(0)
## [1] "string data missing in glb_allobs_df: "
## .rownames
## 0
2.3: manage missing data

#```{r extract_features, cache=FALSE, eval=!is.null(glb_txt_vars)}
glb_chunks_df <- myadd_chunk(glb_chunks_df, "extract.features", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 5 manage.missing.data 2 3 16.518 16.583 0.065
## 6 extract.features 3 0 16.583 NA NA
extract.features_chunk_df <- myadd_chunk(NULL, "extract.features_bgn")
## label step_major step_minor bgn end elapsed
## 1 extract.features_bgn 1 0 16.589 NA NA
# Options:
# Select Tf, log(1 + Tf), Tf-IDF or BM25Tf-IDf
# Create new features that help prediction
# <col_name>.lag.2 <- lag(zoo(glb_trnobs_df$<col_name>), -2, na.pad=TRUE)
# glb_trnobs_df[, "<col_name>.lag.2"] <- coredata(<col_name>.lag.2)
# <col_name>.lag.2 <- lag(zoo(glb_newobs_df$<col_name>), -2, na.pad=TRUE)
# glb_newobs_df[, "<col_name>.lag.2"] <- coredata(<col_name>.lag.2)
#
# glb_newobs_df[1, "<col_name>.lag.2"] <- glb_trnobs_df[nrow(glb_trnobs_df) - 1,
# "<col_name>"]
# glb_newobs_df[2, "<col_name>.lag.2"] <- glb_trnobs_df[nrow(glb_trnobs_df),
# "<col_name>"]
# glb_allobs_df <- mutate(glb_allobs_df,
# A.P.http=ifelse(grepl("http",Added,fixed=TRUE), 1, 0)
# )
#
# glb_trnobs_df <- mutate(glb_trnobs_df,
# )
#
# glb_newobs_df <- mutate(glb_newobs_df,
# )
# Create factors of string variables
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df,
paste0("extract.features_", "factorize.str.vars"), major.inc=TRUE)
## label step_major step_minor bgn end
## 1 extract.features_bgn 1 0 16.589 16.597
## 2 extract.features_factorize.str.vars 2 0 16.597 NA
## elapsed
## 1 0.008
## 2 NA
#stop(here"); sav_allobs_df <- glb_allobs_df; #glb_allobs_df <- sav_allobs_df
print(str_vars <- myfind_chr_cols_df(glb_allobs_df))
## .rownames .src
## ".rownames" ".src"
# Convert remaining character columns (not excluded, not free-text) into
# <var>.fctr factor twins; the raw strings are then excluded as features.
if (length(str_vars <- setdiff(str_vars,
c(glb_exclude_vars_as_features, glb_txt_vars))) > 0) {
for (var in str_vars) {
warning("Creating factors of string variable: ", var,
": # of unique values: ", length(unique(glb_allobs_df[, var])))
# NOTE(review): `levels=` is passed a factor (as.factor(unique(...)));
# factor() coerces it via its levels -- works, but plain unique() would do
glb_allobs_df[, paste0(var, ".fctr")] <- factor(glb_allobs_df[, var],
as.factor(unique(glb_allobs_df[, var])))
# glb_trnobs_df[, paste0(var, ".fctr")] <- factor(glb_trnobs_df[, var],
# as.factor(unique(glb_allobs_df[, var])))
# glb_newobs_df[, paste0(var, ".fctr")] <- factor(glb_newobs_df[, var],
# as.factor(unique(glb_allobs_df[, var])))
}
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, str_vars)
}
if (!is.null(glb_txt_vars)) {
require(foreach)
require(gsubfn)
require(stringr)
require(tm)
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df,
paste0("extract.features_", "process.text"), major.inc=TRUE)
# Tabulate the frequency of every match of rex_str in txt_vctr.
# NOTE(review): txt_vctr is a free variable supplied by the enclosing
# text-processing loop.  Returns a frequency data frame via mycreate_sqlxtab_df.
chk_pattern_freq <- function(rex_str, ignore.case=TRUE) {
    hits <- str_extract_all(txt_vctr, regex(rex_str, ignore_case=ignore.case),
                            simplify=TRUE)
    hits_df <- data.frame(pattern=hits[hits != ""])
    return(mycreate_sqlxtab_df(hits_df, "pattern"))
}
# match_lst <- gregexpr("\\bok(?!ay)", txt_vctr[746], ignore.case = FALSE, perl=TRUE); print(match_lst)
# Display (and return) the rex_str matches found in txt_vctr (free variable):
# one row per element that matched; multiple matches within an element are
# joined with '#'.  Set print.all=FALSE to suppress printing.
dsp_pattern <- function(rex_str, ignore.case=TRUE, print.all=TRUE) {
    pos_lst <- gregexpr(rex_str, txt_vctr, ignore.case=ignore.case, perl=TRUE)
    hit_lst <- regmatches(txt_vctr, pos_lst)
    joined <- sapply(hit_lst, function(elems) paste(elems, collapse="#"))
    match_df <- subset(data.frame(matches=joined), matches != "")
    if (print.all)
        print(match_df)
    return(match_df)
}
# Print the match positions of rex_str within element ix of txt_vctr (free
# variable), plus a text window around each match position (str_sub is
# vectorized over the position vector).
dsp_matches <- function(rex_str, ix) {
    match_pos <- gregexpr(rex_str, txt_vctr[ix], perl=TRUE)
    print(match_pos)
    # window bounds scale with position; preserved from the original verbatim
    bgn <- (match_pos[[1]] / 100) * 99 + 0
    end <- (match_pos[[1]] / 100) * 100 + 100
    print(str_sub(txt_vctr[ix], bgn, end))
}
# Apply every pattern -> replacement pair in gsub_map_lst (free variable) to
# txt_vctr (free variable) via gsub, logging progress every 10 patterns.
# Extra arguments (...) are forwarded to gsub.  Returns the rewritten vector.
myapply_gsub <- function(...) {
    n_ptns <- length(names(gsub_map_lst))
    if (n_ptns == 0)
        return(txt_vctr)
    for (ix in seq_len(n_ptns)) {
        if (ix %% 10 == 0)
            print(sprintf("running gsub for %02d (of %02d): #%s#...", ix,
                          length(names(gsub_map_lst)), names(gsub_map_lst)[ix]))
        txt_vctr <- gsub(names(gsub_map_lst)[ix], gsub_map_lst[[ix]],
                         txt_vctr, ...)
    }
    return(txt_vctr)
}
# Apply every rex_str -> rpl_str row of glb_txt_map_df (free variable) to
# txt_vctr via gsub, logging progress every 10 rows; extra arguments (...)
# are forwarded to gsub.  Returns the rewritten vector.
#
# fix: iterate with seq_len() -- the original's 1:nrows became 1:0 for an
# empty mapping table, indexing a non-existent row and feeding NA to gsub.
myapply_txtmap <- function(txt_vctr, ...) {
    nrows <- nrow(glb_txt_map_df)
    for (ptn_ix in seq_len(nrows)) {
        if ((ptn_ix %% 10) == 0)
            print(sprintf("running gsub for %02d (of %02d): #%s#...", ptn_ix,
                          nrows, glb_txt_map_df[ptn_ix, "rex_str"]))
        txt_vctr <- gsub(glb_txt_map_df[ptn_ix, "rex_str"],
                         glb_txt_map_df[ptn_ix, "rpl_str"],
                         txt_vctr, ...)
    }
    return(txt_vctr)
}
# Quick check that the saved vs current Headline text vectors agree on the
# slice [bgn:end]; prints TRUE or the all.equal difference description.
chk.equal <- function(bgn, end) {
    rng <- bgn:end
    print(all.equal(sav_txt_lst[["Headline"]][rng],
                    glb_txt_lst[["Headline"]][rng]))
}
# Print the saved and current Headline slices side by side for manual diffing.
dsp.equal <- function(bgn, end) {
    rng <- bgn:end
    print(sav_txt_lst[["Headline"]][rng])
    print(glb_txt_lst[["Headline"]][rng])
}
#sav_txt_lst <- glb_txt_lst; all.equal(sav_txt_lst, glb_txt_lst)
#all.equal(sav_txt_lst[["Headline"]][1:4200], glb_txt_lst[["Headline"]][1:4200])
#chk.equal( 1, 100)
#dsp.equal(86, 90)
# Build glb_txt_lst: one cleaned character vector per text variable.
# The regex -> replacement map is loaded from mytxt_map.csv ('#' lines are
# comments) and applied case-sensitively by myapply_txtmap (defined above).
# foreach %dopar% parallelizes across text variables (doMC workers were
# registered in setup). NOTE(review): the loop body's last expression (the
# cleaned txt_vctr) is what foreach collects into glb_txt_lst; the element
# names are attached afterwards.
glb_txt_map_df <- read.csv("mytxt_map.csv", comment.char="#", strip.white=TRUE)
glb_txt_lst <- list();
print(sprintf("Building glb_txt_lst..."))
glb_txt_lst <- foreach(txt_var=glb_txt_vars) %dopar% {
# for (txt_var in glb_txt_vars) {
txt_vctr <- glb_allobs_df[, txt_var]
# myapply_txtmap shd be created as a tm_map::content_transformer ?
#print(glb_txt_map_df)
#txt_var=glb_txt_vars[3]; txt_vctr <- glb_txt_lst[[txt_var]]
#print(rex_str <- glb_txt_map_df[163, "rex_str"])
#print(rex_str <- glb_txt_map_df[glb_txt_map_df$rex_str == "\\bWall St\\.", "rex_str"])
#print(rex_str <- glb_txt_map_df[grepl("du Pont", glb_txt_map_df$rex_str), "rex_str"])
#print(rex_str <- glb_txt_map_df[glb_txt_map_df$rpl_str == "versus", "rex_str"])
#print(tmp_vctr <- grep(rex_str, txt_vctr, value=TRUE, ignore.case=FALSE))
#ret_lst <- regexec(rex_str, txt_vctr, ignore.case=FALSE); ret_lst <- regmatches(txt_vctr, ret_lst); ret_vctr <- sapply(1:length(ret_lst), function(pos_ix) ifelse(length(ret_lst[[pos_ix]]) > 0, ret_lst[[pos_ix]], "")); print(ret_vctr <- ret_vctr[ret_vctr != ""])
#gsub(rex_str, glb_txt_map_df[glb_txt_map_df$rex_str == rex_str, "rpl_str"], tmp_vctr, ignore.case=FALSE)
#grep("Hong Hong", txt_vctr, value=TRUE)
txt_vctr <- myapply_txtmap(txt_vctr, ignore.case=FALSE)
}
names(glb_txt_lst) <- glb_txt_vars
# Diagnostic loops: after the text-map cleanup, report tokens that may still
# need mapping. Each loop prints pattern frequencies (chk_pattern_freq) and
# sample matches (dsp_pattern / dsp_matches) for manual review; nothing here
# mutates the data. The lookaround regexes ((?<!...), (?!...)) presumably
# require perl-compatible matching inside chk_pattern_freq -- TODO confirm.
# --- Leftover "OK" / "Ok" / "ok" tokens (excluding legitimate words) ---
for (txt_var in glb_txt_vars) {
print(sprintf("Remaining OK in %s:", txt_var))
txt_vctr <- glb_txt_lst[[txt_var]]
print(chk_pattern_freq(rex_str <- "(?<!(BO|HO|LO))OK(?!(E\\!|ED|IE|IN|S ))",
ignore.case=FALSE))
match_df <- dsp_pattern(rex_str, ignore.case=FALSE, print.all=FALSE)
for (row in row.names(match_df))
dsp_matches(rex_str, ix=as.numeric(row))
print(chk_pattern_freq(rex_str <- "Ok(?!(a\\.|ay|in|ra|um))", ignore.case=FALSE))
match_df <- dsp_pattern(rex_str, ignore.case=FALSE, print.all=FALSE)
for (row in row.names(match_df))
dsp_matches(rex_str, ix=as.numeric(row))
print(chk_pattern_freq(rex_str <- "(?<!( b| B| c| C| g| G| j| M| p| P| w| W| r| Z|\\(b|ar|bo|Bo|co|Co|Ew|gk|go|ho|ig|jo|kb|ke|Ke|ki|lo|Lo|mo|mt|no|No|po|ra|ro|sm|Sm|Sp|to|To))ok(?!(ay|bo|e |e\\)|e,|e\\.|eb|ed|el|en|er|es|ey|i |ie|in|it|ka|ke|ki|ly|on|oy|ra|st|u |uc|uy|yl|yo))",
ignore.case=FALSE))
match_df <- dsp_pattern(rex_str, ignore.case=FALSE, print.all=FALSE)
for (row in row.names(match_df))
dsp_matches(rex_str, ix=as.numeric(row))
}
# txt_vctr <- glb_txt_lst[[glb_txt_vars[1]]]
# print(chk_pattern_freq(rex_str <- "(?<!( b| c| C| p|\\(b|bo|co|lo|Lo|Sp|to|To))ok(?!(ay|e |e\\)|e,|e\\.|ed|el|en|es|ey|ie|in|on|ra))", ignore.case=FALSE))
# print(chk_pattern_freq(rex_str <- "ok(?!(ay|el|on|ra))", ignore.case=FALSE))
# dsp_pattern(rex_str, ignore.case=FALSE, print.all=FALSE)
# dsp_matches(rex_str, ix=8)
# substr(txt_vctr[86], 5613, 5620)
# substr(glb_allobs_df[301, "review"], 550, 650)
#stop(here"); sav_txt_lst <- glb_txt_lst
# --- Leftover acronyms (runs of "X. Y. ..."), plus single-initial names ---
for (txt_var in glb_txt_vars) {
print(sprintf("Remaining Acronyms in %s:", txt_var))
txt_vctr <- glb_txt_lst[[txt_var]]
print(chk_pattern_freq(rex_str <- "([[:upper:]]\\.( *)){2,}", ignore.case=FALSE))
# Check for names
print(subset(chk_pattern_freq(rex_str <- "(([[:upper:]]+)\\.( *)){1}",
ignore.case=FALSE),
.n > 1))
# dsp_pattern(rex_str="(OK\\.( *)){1}", ignore.case=FALSE)
# dsp_matches(rex_str="(OK\\.( *)){1}", ix=557)
#dsp_matches(rex_str="\\bR\\.I\\.P(\\.*)(\\B)", ix=461)
#dsp_matches(rex_str="\\bR\\.I\\.P(\\.*)", ix=461)
#print(str_sub(txt_vctr[676], 10100, 10200))
#print(str_sub(txt_vctr[74], 1, -1))
}
# --- Multi-word geographic names (Fort/Hong/Las/... + capitalized word) ---
for (txt_var in glb_txt_vars) {
re_str <- "\\b(Fort|Ft\\.|Hong|Las|Los|New|Puerto|Saint|San|St\\.)( |-)(\\w)+"
print(sprintf("Remaining #%s# terms in %s: ", re_str, txt_var))
txt_vctr <- glb_txt_lst[[txt_var]]
print(orderBy(~ -.n +pattern, subset(chk_pattern_freq(re_str, ignore.case=FALSE),
grepl("( |-)[[:upper:]]", pattern))))
print(" consider cleaning if relevant to problem domain; geography name; .n > 1")
#grep("New G", txt_vctr, value=TRUE, ignore.case=FALSE)
#grep("St\\. Wins", txt_vctr, value=TRUE, ignore.case=FALSE)
}
#stop(here"); sav_txt_lst <- glb_txt_lst
# --- Single-letter compass / direction prefixes (N/S/E/W/C + word) ---
for (txt_var in glb_txt_vars) {
re_str <- "\\b(N|S|E|W|C)( |\\.)(\\w)+"
print(sprintf("Remaining #%s# terms in %s: ", re_str, txt_var))
txt_vctr <- glb_txt_lst[[txt_var]]
print(orderBy(~ -.n +pattern, subset(chk_pattern_freq(re_str, ignore.case=FALSE),
grepl(".", pattern))))
#grep("N Weaver", txt_vctr, value=TRUE, ignore.case=FALSE)
}
# --- Spelled-out direction prefixes (North/South/East/West/Central + word) ---
for (txt_var in glb_txt_vars) {
re_str <- "\\b(North|South|East|West|Central)( |\\.)(\\w)+"
print(sprintf("Remaining #%s# terms in %s: ", re_str, txt_var))
txt_vctr <- glb_txt_lst[[txt_var]]
print(orderBy(~ -.n +pattern, subset(chk_pattern_freq(re_str, ignore.case=FALSE),
grepl(".", pattern))))
#grep("Central (African|Bankers|Cast|Italy|Role|Spring)", txt_vctr, value=TRUE, ignore.case=FALSE)
#grep("East (Africa|Berlin|London|Poland|Rivals|Spring)", txt_vctr, value=TRUE, ignore.case=FALSE)
#grep("North (American|Korean|West)", txt_vctr, value=TRUE, ignore.case=FALSE)
#grep("South (Pacific|Street)", txt_vctr, value=TRUE, ignore.case=FALSE)
#grep("St\\. Martins", txt_vctr, value=TRUE, ignore.case=FALSE)
}
find_cmpnd_wrds <- function(txt_vctr) {
    # Report hyphenated ("compound") terms remaining in txt_vctr after the
    # text-map cleanup, excluding terms matched by the regexes in
    # mytxt_compound.csv. Prints the full term matrix, the filtered compound
    # term count, and a sample of the compound-term data frame.
    # Depends on the tm package (Corpus/tm_map/DocumentTermMatrix),
    # doBy::orderBy, and the globals glb_allobs_df / myprint_df.
    txt_corpus <- Corpus(VectorSource(txt_vctr))
    txt_corpus <- tm_map(txt_corpus, tolower)
    txt_corpus <- tm_map(txt_corpus, PlainTextDocument)
    # Keep intra-word dashes so compounds like "thirty-five" survive.
    # Known tm quirk: "net-a-porter" can still be mangled to "net-aporter".
    txt_corpus <- tm_map(txt_corpus, removePunctuation,
                         preserve_intra_word_dashes=TRUE)
    full_Tf_DTM <- DocumentTermMatrix(txt_corpus,
                                      control=list(weighting=weightTf))
    print(" Full TermMatrix:"); print(full_Tf_DTM)
    full_Tf_mtrx <- as.matrix(full_Tf_DTM)
    rownames(full_Tf_mtrx) <- rownames(glb_allobs_df) # print unreadable otherwise
    full_Tf_vctr <- colSums(full_Tf_mtrx)  # total term frequency per term
    names(full_Tf_vctr) <- dimnames(full_Tf_DTM)[[2]]
    full_Tf_df <- as.data.frame(full_Tf_vctr)
    names(full_Tf_df) <- "Tf.full"
    full_Tf_df$term <- rownames(full_Tf_df)
    full_Tf_df <- orderBy(~ -Tf.full, full_Tf_df)
    # Logical mask instead of the original grep(value=TRUE) row indexing,
    # which only worked because the rownames happened to equal the terms.
    cmpnd_Tf_df <- full_Tf_df[grepl("-", full_Tf_df$term), ]
    # Drop compounds matching any known-benign pattern from the filter file.
    filter_df <- read.csv("mytxt_compound.csv", comment.char="#", strip.white=TRUE)
    cmpnd_Tf_df$filter <- FALSE
    # seq_len(): safe when the filter file has no rows (1:0 would misfire).
    for (row_ix in seq_len(nrow(filter_df))) {
        cmpnd_Tf_df[!cmpnd_Tf_df$filter, "filter"] <-
            grepl(filter_df[row_ix, "rex_str"],
                  cmpnd_Tf_df[!cmpnd_Tf_df$filter, "term"], ignore.case=TRUE)
    }
    cmpnd_Tf_df <- subset(cmpnd_Tf_df, !filter)
    print(sprintf("nrow(cmpnd_Tf_df): %d", nrow(cmpnd_Tf_df)))
    myprint_df(cmpnd_Tf_df)
}
# --- Chunk: report remaining compound terms (currently a no-op: the
# find_cmpnd_wrds call is commented out) ---
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df,
paste0("extract.features_", "process.text_reporting_compound_terms"), major.inc=FALSE)
for (txt_var in glb_txt_vars) {
print(sprintf("Remaining compound terms in %s: ", txt_var))
txt_vctr <- glb_txt_lst[[txt_var]]
# find_cmpnd_wrds(txt_vctr)
#grep("thirty-five", txt_vctr, ignore.case=TRUE, value=TRUE)
#rex_str <- glb_txt_map_df[grepl("hirty", glb_txt_map_df$rex_str), "rex_str"]
}
# --- Chunk: build a tm corpus per text variable (parallel via %dopar%).
# Pipeline per variable: lowercase -> PlainTextDocument -> strip punctuation
# -> drop stop words (per-variable extras + English) -> stem.
# NOTE(review): the foreach body's last expression (the tm_map assignment's
# value, i.e. the final txt_corpus) is what foreach collects; the commented
# glb_corpus_lst[[txt_var]] assignment at the end is intentionally unused. ---
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df,
paste0("extract.features_", "build.corpus"), major.inc=TRUE)
glb_corpus_lst <- list()
print(sprintf("Building glb_corpus_lst..."))
glb_corpus_lst <- foreach(txt_var=glb_txt_vars) %dopar% {
# for (txt_var in glb_txt_vars) {
txt_corpus <- Corpus(VectorSource(glb_txt_lst[[txt_var]]))
txt_corpus <- tm_map(txt_corpus, tolower) #nuppr
txt_corpus <- tm_map(txt_corpus, PlainTextDocument)
txt_corpus <- tm_map(txt_corpus, removePunctuation) #npnct<chr_ix>
# txt-corpus <- tm_map(txt_corpus, content_transformer(function(x, pattern) gsub(pattern, "", x))
# Not to be run in production
# Debug-only helper (closure over txt_corpus): prints/plots the full term
# matrix at the current pipeline stage; never called unless uncommented below.
inspect_terms <- function() {
full_Tf_DTM <- DocumentTermMatrix(txt_corpus,
control=list(weighting=weightTf))
print(" Full TermMatrix:"); print(full_Tf_DTM)
full_Tf_mtrx <- as.matrix(full_Tf_DTM)
rownames(full_Tf_mtrx) <- rownames(glb_allobs_df) # print undreadable otherwise
full_Tf_vctr <- colSums(full_Tf_mtrx)
names(full_Tf_vctr) <- dimnames(full_Tf_DTM)[[2]]
#grep("year", names(full_Tf_vctr), value=TRUE)
#which.max(full_Tf_mtrx[, "yearlong"])
full_Tf_df <- as.data.frame(full_Tf_vctr)
names(full_Tf_df) <- "Tf.full"
full_Tf_df$term <- rownames(full_Tf_df)
#full_Tf_df$freq.full <- colSums(full_Tf_mtrx != 0)
full_Tf_df <- orderBy(~ -Tf.full +term, full_Tf_df)
print(myplot_histogram(full_Tf_df, "Tf.full"))
myprint_df(full_Tf_df)
#txt_corpus[[which(grepl("zun", txt_vctr, ignore.case=TRUE))]]
digit_terms_df <- subset(full_Tf_df, grepl("[[:digit:]]", term))
myprint_df(digit_terms_df)
return(full_Tf_df)
}
#print("RemovePunct:"); remove_punct_Tf_df <- inspect_terms()
txt_corpus <- tm_map(txt_corpus, removeWords,
c(glb_append_stop_words[[txt_var]],
stopwords("english"))) #nstopwrds
#print("StoppedWords:"); stopped_words_Tf_df <- inspect_terms()
txt_corpus <- tm_map(txt_corpus, stemDocument) #Features for lost information: Difference/ratio in density of full_TfIdf_DTM ???
#txt_corpus <- tm_map(txt_corpus, content_transformer(stemDocument))
#print("StemmedWords:"); stemmed_words_Tf_df <- inspect_terms()
#stemmed_stopped_Tf_df <- merge(stemmed_words_Tf_df, stopped_words_Tf_df, by="term", all=TRUE, suffixes=c(".stem", ".stop"))
#myprint_df(stemmed_stopped_Tf_df)
#print(subset(stemmed_stopped_Tf_df, grepl("compan", term)))
#glb_corpus_lst[[txt_var]] <- txt_corpus
}
names(glb_corpus_lst) <- glb_txt_vars
# --- Chunk: extract TF-IDF document-term matrices; keep both the full DTM
# and a sparse version pruned by the per-variable glb_sprs_thresholds. ---
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df,
paste0("extract.features_", "extract.DTM"), major.inc=TRUE)
glb_full_DTM_lst <- list(); glb_sprs_DTM_lst <- list();
for (txt_var in glb_txt_vars) {
print(sprintf("Extracting TfIDf terms for %s...", txt_var))
txt_corpus <- glb_corpus_lst[[txt_var]]
# full_Tf_DTM <- DocumentTermMatrix(txt_corpus,
# control=list(weighting=weightTf))
full_TfIdf_DTM <- DocumentTermMatrix(txt_corpus,
control=list(weighting=weightTfIdf))
sprs_TfIdf_DTM <- removeSparseTerms(full_TfIdf_DTM,
glb_sprs_thresholds[txt_var])
# glb_full_DTM_lst[[txt_var]] <- full_Tf_DTM
# glb_sprs_DTM_lst[[txt_var]] <- sprs_Tf_DTM
glb_full_DTM_lst[[txt_var]] <- full_TfIdf_DTM
glb_sprs_DTM_lst[[txt_var]] <- sprs_TfIdf_DTM
}
# --- Chunk: report the full vs. sparse TF-IDF matrices per text variable:
# summarize both, record terms dropped by sparsification into
# glb_important_terms, and plot comparisons. Read-only except for
# glb_important_terms. ---
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df,
paste0("extract.features_", "report.DTM"), major.inc=TRUE)
for (txt_var in glb_txt_vars) {
print(sprintf("Reporting TfIDf terms for %s...", txt_var))
full_TfIdf_DTM <- glb_full_DTM_lst[[txt_var]]
sprs_TfIdf_DTM <- glb_sprs_DTM_lst[[txt_var]]
print(" Full TermMatrix:"); print(full_TfIdf_DTM)
full_TfIdf_mtrx <- as.matrix(full_TfIdf_DTM)
rownames(full_TfIdf_mtrx) <- rownames(glb_allobs_df) # print undreadable otherwise
full_TfIdf_vctr <- colSums(full_TfIdf_mtrx)
names(full_TfIdf_vctr) <- dimnames(full_TfIdf_DTM)[[2]]
#grep("scene", names(full_TfIdf_vctr), value=TRUE)
#which.max(full_TfIdf_mtrx[, "yearlong"])
full_TfIdf_df <- as.data.frame(full_TfIdf_vctr)
names(full_TfIdf_df) <- "TfIdf.full"
full_TfIdf_df$term <- rownames(full_TfIdf_df)
full_TfIdf_df$freq.full <- colSums(full_TfIdf_mtrx != 0)
full_TfIdf_df <- orderBy(~ -TfIdf.full, full_TfIdf_df)
print(" Sparse TermMatrix:"); print(sprs_TfIdf_DTM)
sprs_TfIdf_vctr <- colSums(as.matrix(sprs_TfIdf_DTM))
names(sprs_TfIdf_vctr) <- dimnames(sprs_TfIdf_DTM)[[2]]
sprs_TfIdf_df <- as.data.frame(sprs_TfIdf_vctr)
names(sprs_TfIdf_df) <- "TfIdf.sprs"
sprs_TfIdf_df$term <- rownames(sprs_TfIdf_df)
sprs_TfIdf_df$freq.sprs <- colSums(as.matrix(sprs_TfIdf_DTM) != 0)
sprs_TfIdf_df <- orderBy(~ -TfIdf.sprs, sprs_TfIdf_df)
# Terms with NA freq.sprs below were dropped by removeSparseTerms.
terms_TfIdf_df <- merge(full_TfIdf_df, sprs_TfIdf_df, all.x=TRUE)
terms_TfIdf_df$in.sprs <- !is.na(terms_TfIdf_df$freq.sprs)
plt_TfIdf_df <- subset(terms_TfIdf_df,
TfIdf.full >= min(terms_TfIdf_df$TfIdf.sprs, na.rm=TRUE))
plt_TfIdf_df$label <- ""
plt_TfIdf_df[is.na(plt_TfIdf_df$TfIdf.sprs), "label"] <-
plt_TfIdf_df[is.na(plt_TfIdf_df$TfIdf.sprs), "term"]
# Remember high-TfIdf terms that sparsification dropped; they get dedicated
# <pfx>.T.<term> columns later in the bind.DXM chunk.
glb_important_terms[[txt_var]] <- union(glb_important_terms[[txt_var]],
plt_TfIdf_df[is.na(plt_TfIdf_df$TfIdf.sprs), "term"])
print(myplot_scatter(plt_TfIdf_df, "freq.full", "TfIdf.full",
colorcol_name="in.sprs") +
geom_text(aes(label=label), color="Black", size=3.5))
melt_TfIdf_df <- orderBy(~ -value, melt(terms_TfIdf_df, id.var="term"))
print(ggplot(melt_TfIdf_df, aes(value, color=variable)) + stat_ecdf() +
geom_hline(yintercept=glb_sprs_thresholds[txt_var],
linetype = "dotted"))
melt_TfIdf_df <- orderBy(~ -value,
melt(subset(terms_TfIdf_df, !is.na(TfIdf.sprs)), id.var="term"))
print(myplot_hbar(melt_TfIdf_df, "term", "value",
colorcol_name="variable"))
melt_TfIdf_df <- orderBy(~ -value,
melt(subset(terms_TfIdf_df, is.na(TfIdf.sprs)), id.var="term"))
print(myplot_hbar(head(melt_TfIdf_df, 10), "term", "value",
colorcol_name="variable"))
}
# sav_full_DTM_lst <- glb_full_DTM_lst
# sav_sprs_DTM_lst <- glb_sprs_DTM_lst
# print(identical(sav_glb_corpus_lst, glb_corpus_lst))
# print(all.equal(length(sav_glb_corpus_lst), length(glb_corpus_lst)))
# print(all.equal(names(sav_glb_corpus_lst), names(glb_corpus_lst)))
# print(all.equal(sav_glb_corpus_lst[["Headline"]], glb_corpus_lst[["Headline"]]))
# print(identical(sav_full_DTM_lst, glb_full_DTM_lst))
# print(identical(sav_sprs_DTM_lst, glb_sprs_DTM_lst))
rm(full_TfIdf_mtrx, full_TfIdf_df, melt_TfIdf_df, terms_TfIdf_df)
# Create txt features
# Guard: each text variable's first letter becomes its feature-column prefix
# (e.g. "Headline" -> "H.T.<term>"), so prefixes must be unique.
if ((length(glb_txt_vars) > 1) &&
(length(unique(pfxs <- sapply(glb_txt_vars,
function(txt) toupper(substr(txt, 1, 1))))) < length(glb_txt_vars)))
stop("Prefixes for corpus freq terms not unique: ", pfxs)
# --- Chunk: bind the sparse-DTM term columns onto glb_allobs_df. ---
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df,
paste0("extract.features_", "bind.DTM"),
major.inc=TRUE)
for (txt_var in glb_txt_vars) {
print(sprintf("Binding DTM for %s...", txt_var))
txt_var_pfx <- toupper(substr(txt_var, 1, 1))
txt_X_df <- as.data.frame(as.matrix(glb_sprs_DTM_lst[[txt_var]]))
colnames(txt_X_df) <- paste(txt_var_pfx, ".T.",
make.names(colnames(txt_X_df)), sep="")
rownames(txt_X_df) <- rownames(glb_allobs_df) # warning otherwise
# plt_X_df <- cbind(txt_X_df, glb_allobs_df[, c(glb_id_var, glb_rsp_var)])
# print(myplot_box(df=plt_X_df, ycol_names="H.T.today", xcol_name=glb_rsp_var))
# log_X_df <- log(1 + txt_X_df)
# colnames(log_X_df) <- paste(colnames(txt_X_df), ".log", sep="")
# plt_X_df <- cbind(log_X_df, glb_allobs_df[, c(glb_id_var, glb_rsp_var)])
# print(myplot_box(df=plt_X_df, ycol_names="H.T.today.log", xcol_name=glb_rsp_var))
glb_allobs_df <- cbind(glb_allobs_df, txt_X_df) # TfIdf is normalized
#glb_allobs_df <- cbind(glb_allobs_df, log_X_df) # if using non-normalized metrics
}
#identical(chk_entity_df, glb_allobs_df)
#chk_entity_df <- glb_allobs_df
# --- Chunk: derive aggregate text features ("DXM") per text variable:
# important-term TfIdf columns, word/char/digit/uppercase counts (log1p-style
# via log(1 + n)), per-punctuation-character counts, stop-word ratios, and an
# "http" indicator. Runs in parallel; the per-variable data frames are
# cbind-combined and appended to glb_allobs_df. ---
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df,
paste0("extract.features_", "bind.DXM"),
major.inc=TRUE)
#sav_allobs_df <- glb_allobs_df
# One regex per punctuation class; some are grouped (e.g. "\\(|\\)") so a
# single feature counts both characters.
glb_punct_vctr <- c("!", "\"", "#", "\\$", "%", "&", "'",
"\\(|\\)",# "\\(", "\\)",
"\\*", "\\+", ",", "-", "\\.", "/", ":", ";",
"<|>", # "<",
"=",
# ">",
"\\?", "@", "\\[", "\\\\", "\\]", "^", "_", "`",
"\\{", "\\|", "\\}", "~")
txt_X_df <- glb_allobs_df[, c(glb_id_var, ".rnorm"), FALSE]
txt_X_df <- foreach(txt_var=glb_txt_vars, .combine=cbind) %dopar% {
#for (txt_var in glb_txt_vars) {
print(sprintf("Binding DXM for %s...", txt_var))
txt_var_pfx <- toupper(substr(txt_var, 1, 1))
#txt_X_df <- glb_allobs_df[, c(glb_id_var, ".rnorm"), FALSE]
txt_full_DTM_mtrx <- as.matrix(glb_full_DTM_lst[[txt_var]])
rownames(txt_full_DTM_mtrx) <- rownames(glb_allobs_df) # print undreadable otherwise
#print(txt_full_DTM_mtrx[txt_full_DTM_mtrx[, "ebola"] != 0, "ebola"])
# Create <txt_var>.T.<term> for glb_important_terms
for (term in glb_important_terms[[txt_var]])
txt_X_df[, paste0(txt_var_pfx, ".T.", make.names(term))] <-
txt_full_DTM_mtrx[, term]
# Create <txt_var>.nwrds.log & .nwrds.unq.log
txt_X_df[, paste0(txt_var_pfx, ".nwrds.log")] <-
log(1 + mycount_pattern_occ("\\w+", glb_txt_lst[[txt_var]]))
txt_X_df[, paste0(txt_var_pfx, ".nwrds.unq.log")] <-
log(1 + rowSums(txt_full_DTM_mtrx != 0))
txt_X_df[, paste0(txt_var_pfx, ".sum.TfIdf")] <-
rowSums(txt_full_DTM_mtrx)
txt_X_df[, paste0(txt_var_pfx, ".ratio.sum.TfIdf.nwrds")] <-
txt_X_df[, paste0(txt_var_pfx, ".sum.TfIdf")] /
(exp(txt_X_df[, paste0(txt_var_pfx, ".nwrds.log")]) - 1)
# 0/0 for empty documents -> NaN; coerce to 0.
txt_X_df[is.nan(txt_X_df[, paste0(txt_var_pfx, ".ratio.sum.TfIdf.nwrds")]),
paste0(txt_var_pfx, ".ratio.sum.TfIdf.nwrds")] <- 0
# Create <txt_var>.nchrs.log
txt_X_df[, paste0(txt_var_pfx, ".nchrs.log")] <-
log(1 + mycount_pattern_occ(".", glb_allobs_df[, txt_var]))
txt_X_df[, paste0(txt_var_pfx, ".nuppr.log")] <-
log(1 + mycount_pattern_occ("[[:upper:]]", glb_allobs_df[, txt_var]))
txt_X_df[, paste0(txt_var_pfx, ".ndgts.log")] <-
log(1 + mycount_pattern_occ("[[:digit:]]", glb_allobs_df[, txt_var]))
# Create <txt_var>.npnct?.log
# would this be faster if it's iterated over each row instead of
# each created column ???
for (punct_ix in 1:length(glb_punct_vctr)) {
# smp0 <- " "
# smp1 <- "! \" # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~"
# smp2 <- paste(smp1, smp1, sep=" ")
# print(sprintf("Testing %s pattern:", glb_punct_vctr[punct_ix]))
# results <- mycount_pattern_occ(glb_punct_vctr[punct_ix], c(smp0, smp1, smp2))
# names(results) <- NULL; print(results)
txt_X_df[,
paste0(txt_var_pfx, ".npnct", sprintf("%02d", punct_ix), ".log")] <-
log(1 + mycount_pattern_occ(glb_punct_vctr[punct_ix],
glb_allobs_df[, txt_var]))
}
# print(head(glb_allobs_df[glb_allobs_df[, "A.npnct23.log"] > 0,
# c("UniqueID", "Popular", "Abstract", "A.npnct23.log")]))
# Create <txt_var>.nstopwrds.log & <txt_var>ratio.nstopwrds.nwrds
stop_words_rex_str <- paste0("\\b(", paste0(c(glb_append_stop_words[[txt_var]],
stopwords("english")), collapse="|"),
")\\b")
txt_X_df[, paste0(txt_var_pfx, ".nstopwrds", ".log")] <-
log(1 + mycount_pattern_occ(stop_words_rex_str, glb_txt_lst[[txt_var]]))
# exp(log diff) recovers the count ratio nstopwrds / nwrds.
txt_X_df[, paste0(txt_var_pfx, ".ratio.nstopwrds.nwrds")] <-
exp(txt_X_df[, paste0(txt_var_pfx, ".nstopwrds", ".log")] -
txt_X_df[, paste0(txt_var_pfx, ".nwrds", ".log")])
# Create <txt_var>.P.http
txt_X_df[, paste(txt_var_pfx, ".P.http", sep="")] <-
as.integer(0 + mycount_pattern_occ("http", glb_allobs_df[, txt_var]))
# Drop the id/.rnorm seed columns so cbind across variables doesn't duplicate them.
txt_X_df <- subset(txt_X_df, select=-.rnorm)
txt_X_df <- txt_X_df[, -grep(glb_id_var, names(txt_X_df), fixed=TRUE), FALSE]
#glb_allobs_df <- cbind(glb_allobs_df, txt_X_df)
}
glb_allobs_df <- cbind(glb_allobs_df, txt_X_df)
#myplot_box(glb_allobs_df, "A.sum.TfIdf", glb_rsp_var)
# Generate summaries
# print(summary(glb_allobs_df))
# print(sapply(names(glb_allobs_df), function(col) sum(is.na(glb_allobs_df[, col]))))
# print(summary(glb_trnobs_df))
# print(sapply(names(glb_trnobs_df), function(col) sum(is.na(glb_trnobs_df[, col]))))
# print(summary(glb_newobs_df))
# print(sapply(names(glb_newobs_df), function(col) sum(is.na(glb_newobs_df[, col]))))
# Raw text columns are now represented by derived features; exclude them.
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features,
glb_txt_vars)
rm(log_X_df, txt_X_df)
# NOTE(review): this closing brace matches an opening brace (presumably an
# "if (length(glb_txt_vars) > 0)"-style guard) that lies before this excerpt.
}
# print(sapply(names(glb_trnobs_df), function(col) sum(is.na(glb_trnobs_df[, col]))))
# print(sapply(names(glb_newobs_df), function(col) sum(is.na(glb_newobs_df[, col]))))
# print(myplot_scatter(glb_trnobs_df, "<col1_name>", "<col2_name>", smooth=TRUE))
# Best-effort cleanup of text-processing intermediates; rm() warns (does not
# error) for names that were never created, as the captured output below shows
# for the no-text-variables run of this template.
rm(corpus_lst, full_TfIdf_DTM, full_TfIdf_vctr,
glb_full_DTM_lst, glb_sprs_DTM_lst, txt_corpus, txt_vctr)
## Warning in rm(corpus_lst, full_TfIdf_DTM, full_TfIdf_vctr,
## glb_full_DTM_lst, : object 'corpus_lst' not found
## Warning in rm(corpus_lst, full_TfIdf_DTM, full_TfIdf_vctr,
## glb_full_DTM_lst, : object 'full_TfIdf_DTM' not found
## Warning in rm(corpus_lst, full_TfIdf_DTM, full_TfIdf_vctr,
## glb_full_DTM_lst, : object 'full_TfIdf_vctr' not found
## Warning in rm(corpus_lst, full_TfIdf_DTM, full_TfIdf_vctr,
## glb_full_DTM_lst, : object 'glb_full_DTM_lst' not found
## Warning in rm(corpus_lst, full_TfIdf_DTM, full_TfIdf_vctr,
## glb_full_DTM_lst, : object 'glb_sprs_DTM_lst' not found
## Warning in rm(corpus_lst, full_TfIdf_DTM, full_TfIdf_vctr,
## glb_full_DTM_lst, : object 'txt_corpus' not found
## Warning in rm(corpus_lst, full_TfIdf_DTM, full_TfIdf_vctr,
## glb_full_DTM_lst, : object 'txt_vctr' not found
# Close the extract.features timing chunk and plot the per-chunk durations.
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df, "extract.features_end",
major.inc=TRUE)
## label step_major step_minor bgn end
## 2 extract.features_factorize.str.vars 2 0 16.597 16.614
## 3 extract.features_end 3 0 16.614 NA
## elapsed
## 2 0.017
## 3 NA
myplt_chunk(extract.features_chunk_df)
## label step_major step_minor bgn end
## 2 extract.features_factorize.str.vars 2 0 16.597 16.614
## 1 extract.features_bgn 1 0 16.589 16.597
## elapsed duration
## 2 0.017 0.017
## 1 0.008 0.008
## [1] "Total Elapsed Time: 16.614 secs"
# if (glb_save_envir)
# save(glb_feats_df,
# glb_allobs_df, #glb_trnobs_df, glb_fitobs_df, glb_OOBobs_df, glb_newobs_df,
# file=paste0(glb_out_pfx, "extract_features_dsk.RData"))
# load(paste0(glb_out_pfx, "extract_features_dsk.RData"))
# Advance the workflow petri net: both datasets are now fully featurized.
replay.petrisim(pn=glb_analytics_pn,
replay.trans=(glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
"data.training.all","data.new")), flip_coord=TRUE)
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
glb_chunks_df <- myadd_chunk(glb_chunks_df, "cluster.data", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 6 extract.features 3 0 16.583 17.753 1.17
## 7 cluster.data 4 0 17.753 NA NA
# 4.0: cluster data
if (glb_cluster) {
# Body of the optional clustering step (entered only when glb_cluster is TRUE;
# the "if" opens on the preceding line). Per hard-coded NYT-style category:
# cosine-distance hierarchical clustering (Ward) of the text features, with
# dynamic tree cut; the resulting cluster id is written back to
# glb_allobs_df$.clusterid. NOTE(review): the category list and columns like
# "Popular"/"Headline" are specific to the NYT dataset this template was
# cloned from, not the climate-change data in this run -- verify before reuse.
require(proxy)
#require(hash)
require(dynamicTreeCut)
# glb_hash <- hash(key=unique(glb_allobs_df$myCategory),
# values=1:length(unique(glb_allobs_df$myCategory)))
# glb_hash_lst <- hash(key=unique(glb_allobs_df$myCategory),
# values=1:length(unique(glb_allobs_df$myCategory)))
#stophere; sav_allobs_df <- glb_allobs_df;
print("Clustering features: ")
print(cluster_vars <- grep("[HSA]\\.[PT]\\.", names(glb_allobs_df), value=TRUE))
#print(cluster_vars <- grep("[HSA]\\.", names(glb_allobs_df), value=TRUE))
glb_allobs_df$.clusterid <- 1
#print(max(table(glb_allobs_df$myCategory.fctr) / 20))
for (myCategory in c("##", "Business#Business Day#Dealbook", "OpEd#Opinion#",
"Styles#U.S.#", "Business#Technology#", "Science#Health#",
"Culture#Arts#")) {
ctgry_allobs_df <- glb_allobs_df[glb_allobs_df$myCategory == myCategory, ]
dstns_dist <- dist(ctgry_allobs_df[, cluster_vars], method = "cosine")
dstns_mtrx <- as.matrix(dstns_dist)
# Diagnostics: show the most and least similar observation pairs.
print(sprintf("max distance(%0.4f) pair:", max(dstns_mtrx)))
row_ix <- ceiling(which.max(dstns_mtrx) / ncol(dstns_mtrx))
col_ix <- which.max(dstns_mtrx[row_ix, ])
print(ctgry_allobs_df[c(row_ix, col_ix),
c("UniqueID", "Popular", "myCategory", "Headline", cluster_vars)])
min_dstns_mtrx <- dstns_mtrx
# Mask the zero diagonal so which.min finds a genuine off-diagonal pair.
diag(min_dstns_mtrx) <- 1
print(sprintf("min distance(%0.4f) pair:", min(min_dstns_mtrx)))
row_ix <- ceiling(which.min(min_dstns_mtrx) / ncol(min_dstns_mtrx))
col_ix <- which.min(min_dstns_mtrx[row_ix, ])
print(ctgry_allobs_df[c(row_ix, col_ix),
c("UniqueID", "Popular", "myCategory", "Headline", cluster_vars)])
clusters <- hclust(dstns_dist, method = "ward.D2")
#plot(clusters, labels=NULL, hang=-1)
myplclust(clusters, lab.col=unclass(ctgry_allobs_df[, glb_rsp_var]))
#clusterGroups = cutree(clusters, k=7)
clusterGroups <- cutreeDynamic(clusters, minClusterSize=20, method="tree", deepSplit=0)
# Unassigned groups are labeled 0; the largest group has label 1
table(clusterGroups, ctgry_allobs_df[, glb_rsp_var], useNA="ifany")
#print(ctgry_allobs_df[which(clusterGroups == 1), c("UniqueID", "Popular", "Headline")])
#print(ctgry_allobs_df[(clusterGroups == 1) & !is.na(ctgry_allobs_df$Popular) & (ctgry_allobs_df$Popular==1), c("UniqueID", "Popular", "Headline")])
# Fold unassigned observations into the largest cluster.
clusterGroups[clusterGroups == 0] <- 1
table(clusterGroups, ctgry_allobs_df[, glb_rsp_var], useNA="ifany")
#summary(factor(clusterGroups))
# clusterGroups <- clusterGroups +
# 100 * # has to be > max(table(glb_allobs_df$myCategory.fctr) / minClusterSize=20)
# which(levels(glb_allobs_df$myCategory.fctr) == myCategory)
# table(clusterGroups, ctgry_allobs_df[, glb_rsp_var], useNA="ifany")
# add to glb_allobs_df - then split the data again
glb_allobs_df[glb_allobs_df$myCategory==myCategory,]$.clusterid <- clusterGroups
#print(unique(glb_allobs_df$.clusterid))
#print(glb_feats_df[glb_feats_df$id == ".clusterid.fctr", ])
}
# Cross-tabulate category x cluster x response for review.
ctgry_xtab_df <- orderBy(reformulate(c("-", ".n")),
mycreate_sqlxtab_df(glb_allobs_df,
c("myCategory", ".clusterid", glb_rsp_var)))
ctgry_cast_df <- orderBy(~ -Y -NA, dcast(ctgry_xtab_df,
myCategory + .clusterid ~
Popular.fctr, sum, value.var=".n"))
print(ctgry_cast_df)
#print(orderBy(~ myCategory -Y -NA, ctgry_cast_df))
# write.table(ctgry_cast_df, paste0(glb_out_pfx, "ctgry_clst.csv"),
# row.names=FALSE)
print(ctgry_sum_tbl <- table(glb_allobs_df$myCategory, glb_allobs_df$.clusterid,
glb_allobs_df[, glb_rsp_var],
useNA="ifany"))
# dsp_obs(.clusterid=1, myCategory="OpEd#Opinion#",
# cols=c("UniqueID", "Popular", "myCategory", ".clusterid", "Headline"),
# all=TRUE)
# Expose the cluster id as a factor feature (interaction-only with category);
# raw .clusterid and the clustering inputs are excluded from modeling.
glb_allobs_df$.clusterid.fctr <- as.factor(glb_allobs_df$.clusterid)
glb_exclude_vars_as_features <- c(glb_exclude_vars_as_features,
".clusterid")
glb_interaction_only_features["myCategory.fctr"] <- c(".clusterid.fctr")
glb_exclude_vars_as_features <- c(glb_exclude_vars_as_features,
cluster_vars)
}
# Re-partition
# Rebuild the train/test views from glb_allobs_df (which may have gained
# .clusterid columns above); .src was assigned when the datasets were merged.
glb_trnobs_df <- subset(glb_allobs_df, .src == "Train")
glb_newobs_df <- subset(glb_allobs_df, .src == "Test")
glb_chunks_df <- myadd_chunk(glb_chunks_df, "select.features", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 7 cluster.data 4 0 17.753 18.033 0.28
## 8 select.features 5 0 18.034 NA NA
# 5.0: select features
print(glb_feats_df <- myselect_features(entity_df=glb_trnobs_df,
exclude_vars_as_features=glb_exclude_vars_as_features,
rsp_var=glb_rsp_var))
## id cor.y exclude.as.feat cor.y.abs
## CO2 CO2 0.78852921 0 0.78852921
## Year Year 0.78679714 0 0.78679714
## N2O N2O 0.77863893 0 0.77863893
## CH4 CH4 0.70325502 0 0.70325502
## CFC.12 CFC.12 0.68755755 0 0.68755755
## CFC.11 CFC.11 0.40771029 0 0.40771029
## Aerosols Aerosols -0.38491375 0 0.38491375
## TSI TSI 0.24338269 0 0.24338269
## MEI MEI 0.17247075 0 0.17247075
## Month Month -0.09985674 0 0.09985674
## .rnorm .rnorm -0.04112864 0 0.04112864
# sav_feats_df <- glb_feats_df; glb_feats_df <- sav_feats_df
# Flag pairwise highly-correlated features (see captured warnings below:
# N2O~Year, Year~CO2, CFC.12~CH4, CH4~CO2) and re-order by correlation with
# the response.
print(glb_feats_df <- orderBy(~-cor.y,
myfind_cor_features(feats_df=glb_feats_df, obs_df=glb_trnobs_df,
rsp_var=glb_rsp_var)))
## Loading required package: reshape2
## [1] "cor(N2O, Year)=0.9938"
## [1] "cor(Temp, N2O)=0.7786"
## [1] "cor(Temp, Year)=0.7868"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified N2O as highly correlated with Year
## [1] "cor(CO2, Year)=0.9827"
## [1] "cor(Temp, CO2)=0.7885"
## [1] "cor(Temp, Year)=0.7868"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified Year as highly correlated with CO2
## [1] "cor(CFC.12, CH4)=0.9636"
## [1] "cor(Temp, CFC.12)=0.6876"
## [1] "cor(Temp, CH4)=0.7033"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified CFC.12 as highly correlated with CH4
## [1] "cor(CH4, CO2)=0.8773"
## [1] "cor(Temp, CH4)=0.7033"
## [1] "cor(Temp, CO2)=0.7885"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified CH4 as highly correlated with CO2
## id cor.y exclude.as.feat cor.y.abs cor.high.X freqRatio
## 6 CO2 0.78852921 0 0.78852921 <NA> 1.0
## 11 Year 0.78679714 0 0.78679714 CO2 1.0
## 9 N2O 0.77863893 0 0.77863893 Year 1.5
## 5 CH4 0.70325502 0 0.70325502 CO2 1.0
## 4 CFC.12 0.68755755 0 0.68755755 CH4 2.0
## 3 CFC.11 0.40771029 0 0.40771029 <NA> 2.0
## 10 TSI 0.24338269 0 0.24338269 <NA> 1.0
## 7 MEI 0.17247075 0 0.17247075 <NA> 1.0
## 1 .rnorm -0.04112864 0 0.04112864 <NA> 1.0
## 8 Month -0.09985674 0 0.09985674 <NA> 1.0
## 2 Aerosols -0.38491375 0 0.38491375 <NA> 3.6
## percentUnique zeroVar nzv myNearZV is.cor.y.abs.low
## 6 96.830986 FALSE FALSE FALSE FALSE
## 11 8.450704 FALSE FALSE FALSE FALSE
## 9 98.591549 FALSE FALSE FALSE FALSE
## 5 98.591549 FALSE FALSE FALSE FALSE
## 4 99.647887 FALSE FALSE FALSE FALSE
## 3 99.647887 FALSE FALSE FALSE FALSE
## 10 98.591549 FALSE FALSE FALSE FALSE
## 7 95.774648 FALSE FALSE FALSE FALSE
## 1 100.000000 FALSE FALSE FALSE FALSE
## 8 4.225352 FALSE FALSE FALSE FALSE
## 2 53.169014 FALSE FALSE FALSE FALSE
#subset(glb_feats_df, id %in% c("A.nuppr.log", "S.nuppr.log"))
# Visualize near-zero-variance diagnostics (freqRatio vs. percentUnique);
# points outside xlim(-5, 25) are dropped by ggplot, hence the warnings.
print(myplot_scatter(glb_feats_df, "percentUnique", "freqRatio",
colorcol_name="myNearZV", jitter=TRUE) +
geom_point(aes(shape=nzv)) + xlim(-5, 25))
## Warning in myplot_scatter(glb_feats_df, "percentUnique", "freqRatio",
## colorcol_name = "myNearZV", : converting myNearZV to class:factor
## Warning in loop_apply(n, do.ply): Removed 9 rows containing missing values
## (geom_point).
## Warning in loop_apply(n, do.ply): Removed 9 rows containing missing values
## (geom_point).
## Warning in loop_apply(n, do.ply): Removed 9 rows containing missing values
## (geom_point).
print(subset(glb_feats_df, myNearZV))
## [1] id cor.y exclude.as.feat cor.y.abs
## [5] cor.high.X freqRatio percentUnique zeroVar
## [9] nzv myNearZV is.cor.y.abs.low
## <0 rows> (or 0-length row.names)
# Drop near-zero-variance columns from the data (none in this run).
glb_allobs_df <- glb_allobs_df[, setdiff(names(glb_allobs_df),
subset(glb_feats_df, myNearZV)$id)]
# Record interaction-only features (value = feature, name = its partner).
if (!is.null(glb_interaction_only_features))
glb_feats_df[glb_feats_df$id %in% glb_interaction_only_features, "interaction.feat"] <-
names(glb_interaction_only_features) else
glb_feats_df$interaction.feat <- NA
# Abort the run on remaining NA/Inf/NaN or missing strings.
mycheck_problem_data(glb_allobs_df, terminate = TRUE)
## [1] "numeric data missing in : "
## named integer(0)
## [1] "numeric data w/ 0s in : "
## named integer(0)
## [1] "numeric data w/ Infs in : "
## named integer(0)
## [1] "numeric data w/ NaNs in : "
## named integer(0)
## [1] "string data missing in : "
## .rownames
## 0
# glb_allobs_df %>% filter(is.na(Married.fctr)) %>% tbl_df()
# glb_allobs_df %>% count(Married.fctr)
# levels(glb_allobs_df$Married.fctr)
glb_chunks_df <- myadd_chunk(glb_chunks_df, "partition.data.training", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 8 select.features 5 0 18.034 18.85 0.817
## 9 partition.data.training 6 0 18.851 NA NA
# 6.0: partition data training
if (all(is.na(glb_newobs_df[, glb_rsp_var]))) {
# If the new/test data has no labels, carve an OOB (out-of-bag) holdout from
# the training data (stratified via caTools::sample.split, sized ~1.1x the
# test set); otherwise use the labeled test data itself as OOB.
# (The "if" condition opens on the preceding line.)
require(caTools)
set.seed(glb_split_sample.seed)
split <- sample.split(glb_trnobs_df[, glb_rsp_var_raw],
SplitRatio=1 - (nrow(glb_newobs_df) * 1.1 / nrow(glb_trnobs_df)))
glb_fitobs_df <- glb_trnobs_df[split, ]
glb_OOBobs_df <- glb_trnobs_df[!split ,]
} else {
print(sprintf("Newdata contains non-NA data for %s; setting OOB to Newdata",
glb_rsp_var))
glb_fitobs_df <- glb_trnobs_df; glb_OOBobs_df <- glb_newobs_df
}
## [1] "Newdata contains non-NA data for Temp; setting OOB to Newdata"
# Optionally cap the fit set size (glb_max_fitobs is NULL in this run).
if (!is.null(glb_max_fitobs) && (nrow(glb_fitobs_df) > glb_max_fitobs)) {
warning("glb_fitobs_df restricted to glb_max_fitobs: ",
format(glb_max_fitobs, big.mark=","))
org_fitobs_df <- glb_fitobs_df
glb_fitobs_df <-
org_fitobs_df[split <- sample.split(org_fitobs_df[, glb_rsp_var_raw],
SplitRatio=glb_max_fitobs), ]
org_fitobs_df <- NULL
}
# Tag each observation's partition in .lcn: "Fit", "OOB", or "" (new data).
glb_allobs_df$.lcn <- ""
glb_allobs_df[glb_allobs_df[, glb_id_var] %in%
glb_fitobs_df[, glb_id_var], ".lcn"] <- "Fit"
glb_allobs_df[glb_allobs_df[, glb_id_var] %in%
glb_OOBobs_df[, glb_id_var], ".lcn"] <- "OOB"
dsp_class_dstrb <- function(obs_df, location_var, partition_var) {
    # Print the class distribution of partition_var by location_var
    # (e.g. .lcn = Fit/OOB): first raw counts, then row proportions.
    # Uses the project helper mycreate_xtab_df for the cross-tab.
    xtab_df <- mycreate_xtab_df(obs_df, c(location_var, partition_var))
    rownames(xtab_df) <- xtab_df[, location_var]
    # Bug fix: the original `xtab_df[, -grepl(location_var, names(xtab_df))]`
    # coerced the logical match vector to c(-1, 0, ...), which always dropped
    # column 1 regardless of where location_var actually sat. Use a logical
    # mask (and drop=FALSE so a single remaining column stays a data frame).
    xtab_df <- xtab_df[, !grepl(location_var, names(xtab_df)), drop=FALSE]
    print(xtab_df)
    print(xtab_df / rowSums(xtab_df, na.rm=TRUE))
}
# Ensure proper splits by glb_rsp_var_raw & user-specified feature for OOB vs. new
# Verify that OOB and new/test data have similar category frequency profiles
# (only when category variables are configured for this dataset).
if (!is.null(glb_category_vars)) {
if (glb_is_classification)
dsp_class_dstrb(glb_allobs_df, ".lcn", glb_rsp_var_raw)
newobs_ctgry_df <- mycreate_sqlxtab_df(subset(glb_allobs_df, .src == "Test"),
glb_category_vars)
OOBobs_ctgry_df <- mycreate_sqlxtab_df(subset(glb_allobs_df, .lcn == "OOB"),
glb_category_vars)
glb_ctgry_df <- merge(newobs_ctgry_df, OOBobs_ctgry_df, by=glb_category_vars
, all=TRUE, suffixes=c(".Tst", ".OOB"))
glb_ctgry_df$.freqRatio.Tst <- glb_ctgry_df$.n.Tst / sum(glb_ctgry_df$.n.Tst, na.rm=TRUE)
glb_ctgry_df$.freqRatio.OOB <- glb_ctgry_df$.n.OOB / sum(glb_ctgry_df$.n.OOB, na.rm=TRUE)
print(orderBy(~-.freqRatio.Tst-.freqRatio.OOB, glb_ctgry_df))
}
# Run this line by line
# Annotate glb_feats_df with role flags: raw response, id variable, and the
# (possibly transformed) response row appended via myrbind_df.
print("glb_feats_df:"); print(dim(glb_feats_df))
## [1] "glb_feats_df:"
## [1] 11 12
sav_feats_df <- glb_feats_df
glb_feats_df <- sav_feats_df
glb_feats_df[, "rsp_var_raw"] <- FALSE
glb_feats_df[glb_feats_df$id == glb_rsp_var_raw, "rsp_var_raw"] <- TRUE
glb_feats_df$exclude.as.feat <- (glb_feats_df$exclude.as.feat == 1)
if (!is.null(glb_id_var) && glb_id_var != ".rownames")
glb_feats_df[glb_feats_df$id %in% glb_id_var, "id_var"] <- TRUE
add_feats_df <- data.frame(id=glb_rsp_var, exclude.as.feat=TRUE, rsp_var=TRUE)
row.names(add_feats_df) <- add_feats_df$id; print(add_feats_df)
## id exclude.as.feat rsp_var
## Temp Temp TRUE TRUE
glb_feats_df <- myrbind_df(glb_feats_df, add_feats_df)
if (glb_id_var != ".rownames")
print(subset(glb_feats_df, rsp_var_raw | rsp_var | id_var)) else
print(subset(glb_feats_df, rsp_var_raw | rsp_var))
## id cor.y exclude.as.feat cor.y.abs cor.high.X freqRatio
## Temp Temp NA TRUE NA <NA> NA
## percentUnique zeroVar nzv myNearZV is.cor.y.abs.low interaction.feat
## Temp NA NA NA NA NA NA
## rsp_var_raw rsp_var
## Temp NA TRUE
# Sanity checks: every tracked feature id must exist as a column of
# glb_allobs_df, and any glb_allobs_df column NOT tracked in glb_feats_df
# must be a character (text) variable. Both setdiff()s should print
# character(0); anything else indicates a bookkeeping error upstream.
print("glb_feats_df vs. glb_allobs_df: ");
## [1] "glb_feats_df vs. glb_allobs_df: "
print(setdiff(glb_feats_df$id, names(glb_allobs_df)))
## character(0)
print("glb_allobs_df vs. glb_feats_df: ");
## [1] "glb_allobs_df vs. glb_feats_df: "
# Ensure these are only chr vars
print(setdiff(setdiff(names(glb_allobs_df), glb_feats_df$id),
myfind_chr_cols_df(glb_allobs_df)))
## character(0)
#print(setdiff(setdiff(names(glb_allobs_df), glb_exclude_vars_as_features),
# glb_feats_df$id))
# Report the dimensions of each data partition (all / train / fit / OOB / new)
# so the split sizes can be eyeballed before model fitting begins.
print("glb_allobs_df: "); print(dim(glb_allobs_df))
## [1] "glb_allobs_df: "
## [1] 308 15
print("glb_trnobs_df: "); print(dim(glb_trnobs_df))
## [1] "glb_trnobs_df: "
## [1] 284 14
print("glb_fitobs_df: "); print(dim(glb_fitobs_df))
## [1] "glb_fitobs_df: "
## [1] 284 14
print("glb_OOBobs_df: "); print(dim(glb_OOBobs_df))
## [1] "glb_OOBobs_df: "
## [1] 24 14
print("glb_newobs_df: "); print(dim(glb_newobs_df))
## [1] "glb_newobs_df: "
## [1] 24 14
# # Does not handle NULL or length(glb_id_var) > 1
# glb_allobs_df$.src.trn <- 0
# glb_allobs_df[glb_allobs_df[, glb_id_var] %in% glb_trnobs_df[, glb_id_var],
# ".src.trn"] <- 1
# glb_allobs_df$.src.fit <- 0
# glb_allobs_df[glb_allobs_df[, glb_id_var] %in% glb_fitobs_df[, glb_id_var],
# ".src.fit"] <- 1
# glb_allobs_df$.src.OOB <- 0
# glb_allobs_df[glb_allobs_df[, glb_id_var] %in% glb_OOBobs_df[, glb_id_var],
# ".src.OOB"] <- 1
# glb_allobs_df$.src.new <- 0
# glb_allobs_df[glb_allobs_df[, glb_id_var] %in% glb_newobs_df[, glb_id_var],
# ".src.new"] <- 1
# #print(unique(glb_allobs_df[, ".src.trn"]))
# write_cols <- c(glb_feats_df$id,
# ".src.trn", ".src.fit", ".src.OOB", ".src.new")
# glb_allobs_df <- glb_allobs_df[, write_cols]
#
# tmp_feats_df <- glb_feats_df
# tmp_entity_df <- glb_allobs_df
# Optional checkpoint: persist the feature metadata and the combined
# observations so downstream chunks can be re-run without rebuilding them
# (glb_save_envir is FALSE by default; flip it in the control section).
if (glb_save_envir)
save(glb_feats_df,
glb_allobs_df, #glb_trnobs_df, glb_fitobs_df, glb_OOBobs_df, glb_newobs_df,
file=paste0(glb_out_pfx, "blddfs_dsk.RData"))
# load(paste0(glb_out_pfx, "blddfs_dsk.RData"))
# if (!all.equal(tmp_feats_df, glb_feats_df))
# stop("glb_feats_df r/w not working")
# if (!all.equal(tmp_entity_df, glb_allobs_df))
# stop("glb_allobs_df r/w not working")
# Drop a stale `split` object if an earlier chunk created one; the exists()
# guard (inherits = FALSE restricts the lookup to this environment, so
# base::split does not satisfy it) avoids the spurious
# "object 'split' not found" warning that a bare rm(split) emits.
if (exists("split", inherits = FALSE)) rm(split)
# Advance the chunk timer into the fit.models phase.
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 9 partition.data.training 6 0 18.851 19.13 0.279
## 10 fit.models 7 0 19.130 NA NA
# 7.0: fit models
# load(paste0(glb_out_pfx, "dsk.RData"))
# keep_cols <- setdiff(names(glb_allobs_df),
# grep("^.src", names(glb_allobs_df), value=TRUE))
# glb_trnobs_df <- glb_allobs_df[glb_allobs_df$.src.trn == 1, keep_cols]
# glb_fitobs_df <- glb_allobs_df[glb_allobs_df$.src.fit == 1, keep_cols]
# glb_OOBobs_df <- glb_allobs_df[glb_allobs_df$.src.OOB == 1, keep_cols]
# glb_newobs_df <- glb_allobs_df[glb_allobs_df$.src.new == 1, keep_cols]
#
# glb_models_lst <- list(); glb_models_df <- data.frame()
#
# Abort early if a binomial classification response collapsed to a single
# level in the fit partition. isTRUE() keeps the condition safe when
# glb_is_binomial is still NULL (its declared default in the control section);
# the original `&& glb_is_binomial` would error on NULL for classification runs.
if (glb_is_classification && isTRUE(glb_is_binomial) &&
    (length(unique(glb_fitobs_df[, glb_rsp_var])) < 2))
    stop("glb_fitobs_df$", glb_rsp_var, ": contains less than 2 unique values: ",
         paste0(unique(glb_fitobs_df[, glb_rsp_var]), collapse=", "))
# Pick the two features most correlated with the response among the usable
# candidates (not excluded, not low-correlation, not collinear with another).
max_cor_y_x_vars <- orderBy(~ -cor.y.abs,
    subset(glb_feats_df, (exclude.as.feat == 0) & !is.cor.y.abs.low &
                         is.na(cor.high.X)))[1:2, "id"]
# while(length(max_cor_y_x_vars) < 2) {
#     max_cor_y_x_vars <- c(max_cor_y_x_vars, orderBy(~ -cor.y.abs,
# subset(glb_feats_df, (exclude.as.feat == 0) & !is.cor.y.abs.low))[3, "id"])
# }
if (!is.null(glb_Baseline_mdl_var)) {
    # The configured baseline variable should be the strongest predictor;
    # flag it when some other variable correlates more strongly with the
    # response. (&& replaces scalar &; message wording fixed -- the condition
    # detects a HIGHER correlation than the baseline's, not a lower one.)
    if ((max_cor_y_x_vars[1] != glb_Baseline_mdl_var) &&
        (glb_feats_df[max_cor_y_x_vars[1], "cor.y.abs"] >
         glb_feats_df[glb_Baseline_mdl_var, "cor.y.abs"]))
        stop(max_cor_y_x_vars[1], " has a higher correlation with ", glb_rsp_var,
             " than the Baseline var: ", glb_Baseline_mdl_var)
}
# glb_is_regression is a scalar flag: plain if/else is the idiomatic choice
# here (ifelse() is meant for vectors and obscures the intent on scalars).
glb_model_type <- if (glb_is_regression) "regression" else "classification"
# Baseline model: fit only when a baseline predictor has been configured.
if (!is.null(glb_Baseline_mdl_var))
    ret_lst <- myfit_mdl_fn(model_id="Baseline", model_method="mybaseln_classfr",
                            indep_vars_vctr=glb_Baseline_mdl_var,
                            rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
                            fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df)
# Most Frequent Outcome "MFO" model: mean(y) for regression
# Not using caret's nullModel since model stats not avl
# Cannot use rpart for multinomial classification since it predicts non-MFO
ret_lst <- myfit_mdl(model_id="MFO",
                     model_method=if (glb_is_regression) "lm" else "myMFO_classfr",
                     model_type=glb_model_type,
                     indep_vars_vctr=".rnorm",
                     rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
                     fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df)
## [1] "fitting model: MFO.lm"
## [1] " indep_vars: .rnorm"
## Fitting parameter = none on full training set
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.52335 -0.13207 -0.01296 0.15881 0.49022
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.247927 0.010760 23.042 <2e-16 ***
## .rnorm -0.008127 0.011757 -0.691 0.49
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1813 on 282 degrees of freedom
## Multiple R-squared: 0.001692, Adjusted R-squared: -0.001849
## F-statistic: 0.4778 on 1 and 282 DF, p-value: 0.49
##
## [1] " calling mypredict_mdl for fit:"
## [1] " calling mypredict_mdl for OOB:"
## model_id model_method feats max.nTuningRuns min.elapsedtime.everything
## 1 MFO.lm lm .rnorm 0 0.458
## min.elapsedtime.final max.R.sq.fit min.RMSE.fit max.R.sq.OOB
## 1 0.003 0.001691565 0.1806634 -0.01939772
## min.RMSE.OOB max.Adj.R.sq.fit
## 1 0.1577691 -0.001848536
# "Random" baseline -- classification only; for regression it would coincide
# with the MFO (mean) model. Braces make the if-body explicit instead of
# relying on intervening comment lines being skipped by the parser.
if (glb_is_classification) {
    ret_lst <- myfit_mdl(model_id="Random", model_method="myrandom_classfr",
                         model_type=glb_model_type,
                         indep_vars_vctr=".rnorm",
                         rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
                         fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df)
}
# Any models that have tuning parameters has "better" results with cross-validation
# (except rf) & "different" results for different outcome metrics
# Max.cor.Y
# Check impact of cv
# rpart is not a good candidate since caret does not optimize cp (only tuning parameter of rpart) well
ret_lst <- myfit_mdl(model_id="Max.cor.Y.cv.0",
                     model_method="rpart",
                     model_type=glb_model_type,
                     indep_vars_vctr=max_cor_y_x_vars,
                     rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
                     fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df)
## [1] "fitting model: Max.cor.Y.cv.0.rpart"
## [1] " indep_vars: CO2, CFC.11"
## Loading required package: rpart
## Fitting cp = 0.546 on full training set
## Loading required package: rpart.plot
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 284
##
## CP nsplit rel error
## 1 0.5456846 0 1
##
## Node number 1: 284 observations
## mean=0.2477993, MSE=0.03269456
##
## n= 284
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 284 9.285256 0.2477993 *
## [1] " calling mypredict_mdl for fit:"
## [1] " calling mypredict_mdl for OOB:"
## model_id model_method feats max.nTuningRuns
## 1 Max.cor.Y.cv.0.rpart rpart CO2, CFC.11 0
## min.elapsedtime.everything min.elapsedtime.final max.R.sq.fit
## 1 0.785 0.012 0
## min.RMSE.fit max.R.sq.OOB min.RMSE.OOB
## 1 0.1808164 0 0.1562608
# Same Max.cor.Y tree but with cp pinned to 0 (fully grown, unpruned tree) and
# no cross-validation -- used to gauge how badly the unpruned fit over-fits
# relative to the OOB partition.
ret_lst <- myfit_mdl(model_id="Max.cor.Y.cv.0.cp.0",
model_method="rpart",
model_type=glb_model_type,
indep_vars_vctr=max_cor_y_x_vars,
rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df,
n_cv_folds=0,
tune_models_df=data.frame(parameter="cp", min=0.0, max=0.0, by=0.1))
## [1] "fitting model: Max.cor.Y.cv.0.cp.0.rpart"
## [1] " indep_vars: CO2, CFC.11"
## Fitting cp = 0 on full training set
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 284
##
## CP nsplit rel error
## 1 0.545684590 0 1.0000000
## 2 0.089243627 1 0.4543154
## 3 0.034193533 2 0.3650718
## 4 0.019440628 5 0.2624912
## 5 0.015370725 6 0.2430506
## 6 0.014901246 7 0.2276798
## 7 0.014777393 8 0.2127786
## 8 0.012131745 9 0.1980012
## 9 0.006402599 10 0.1858694
## 10 0.003816922 13 0.1666617
## 11 0.003150164 14 0.1628447
## 12 0.003088947 15 0.1596946
## 13 0.002574496 16 0.1566056
## 14 0.002349241 17 0.1540311
## 15 0.002099649 18 0.1516819
## 16 0.001886464 22 0.1432833
## 17 0.001540140 23 0.1413968
## 18 0.001368570 24 0.1398567
## 19 0.000000000 25 0.1384881
##
## Variable importance
## CO2 CFC.11
## 68 32
##
## Node number 1: 284 observations, complexity param=0.5456846
## mean=0.2477993, MSE=0.03269456
## left son=2 (168 obs) right son=3 (116 obs)
## Primary splits:
## CO2 < 364.25 to the left, improve=0.5456846, (0 missing)
## CFC.11 < 234.288 to the left, improve=0.3052006, (0 missing)
## Surrogate splits:
## CFC.11 < 248.3335 to the left, agree=0.634, adj=0.103, (0 split)
##
## Node number 2: 168 observations, complexity param=0.08924363
## mean=0.1368095, MSE=0.01579275
## left son=4 (44 obs) right son=5 (124 obs)
## Primary splits:
## CFC.11 < 229.452 to the left, improve=0.3123231, (0 missing)
## CO2 < 355.61 to the left, improve=0.2438382, (0 missing)
## Surrogate splits:
## CO2 < 348.29 to the left, agree=0.946, adj=0.795, (0 split)
##
## Node number 3: 116 observations, complexity param=0.03419353
## mean=0.4085431, MSE=0.01349356
## left son=6 (46 obs) right son=7 (70 obs)
## Primary splits:
## CFC.11 < 261.795 to the right, improve=0.1851078, (0 missing)
## CO2 < 371.765 to the left, improve=0.1764773, (0 missing)
## Surrogate splits:
## CO2 < 371.825 to the left, agree=0.931, adj=0.826, (0 split)
##
## Node number 4: 44 observations, complexity param=0.01537072
## mean=0.01890909, MSE=0.007631264
## left son=8 (35 obs) right son=9 (9 obs)
## Primary splits:
## CFC.11 < 197.489 to the right, improve=0.4250490, (0 missing)
## CO2 < 344.275 to the right, improve=0.1042342, (0 missing)
## Surrogate splits:
## CO2 < 340.825 to the right, agree=0.841, adj=0.222, (0 split)
##
## Node number 5: 124 observations, complexity param=0.01944063
## mean=0.1786452, MSE=0.0120061
## left son=10 (97 obs) right son=11 (27 obs)
## Primary splits:
## CO2 < 359.735 to the left, improve=0.1212497, (0 missing)
## CFC.11 < 270.4475 to the right, improve=0.1078429, (0 missing)
##
## Node number 6: 46 observations, complexity param=0.03419353
## mean=0.3468913, MSE=0.02144514
## left son=12 (24 obs) right son=13 (22 obs)
## Primary splits:
## CFC.11 < 265.5055 to the left, improve=0.2356230, (0 missing)
## CO2 < 369.55 to the right, improve=0.0521811, (0 missing)
## Surrogate splits:
## CO2 < 369.115 to the right, agree=0.783, adj=0.545, (0 split)
##
## Node number 7: 70 observations, complexity param=0.002099649
## mean=0.4490571, MSE=0.004129082
## left son=14 (10 obs) right son=15 (60 obs)
## Primary splits:
## CO2 < 372.325 to the left, improve=0.05074610, (0 missing)
## CFC.11 < 250.9935 to the left, improve=0.04516619, (0 missing)
## Surrogate splits:
## CFC.11 < 260.2975 to the right, agree=0.871, adj=0.1, (0 split)
##
## Node number 8: 35 observations, complexity param=0.003816922
## mean=-0.009971429, MSE=0.004965685
## left son=16 (23 obs) right son=17 (12 obs)
## Primary splits:
## CFC.11 < 217.854 to the left, improve=0.2039201, (0 missing)
## CO2 < 344.655 to the left, improve=0.1270717, (0 missing)
## Surrogate splits:
## CO2 < 349.345 to the left, agree=0.743, adj=0.25, (0 split)
##
## Node number 9: 9 observations
## mean=0.1312222, MSE=0.002139506
##
## Node number 10: 97 observations, complexity param=0.01490125
## mean=0.1585155, MSE=0.009596786
## left son=20 (21 obs) right son=21 (76 obs)
## Primary splits:
## CFC.11 < 270.174 to the right, improve=0.14863430, (0 missing)
## CO2 < 355.64 to the left, improve=0.06312448, (0 missing)
##
## Node number 11: 27 observations, complexity param=0.01213175
## mean=0.250963, MSE=0.01397618
## left son=22 (20 obs) right son=23 (7 obs)
## Primary splits:
## CFC.11 < 268.694 to the right, improve=0.2985141, (0 missing)
## CO2 < 360.795 to the right, improve=0.1178786, (0 missing)
## Surrogate splits:
## CO2 < 363.89 to the left, agree=0.778, adj=0.143, (0 split)
##
## Node number 12: 24 observations, complexity param=0.00136857
## mean=0.2788333, MSE=0.003799306
## left son=24 (16 obs) right son=25 (8 obs)
## Primary splits:
## CO2 < 370.425 to the left, improve=0.13936230, (0 missing)
## CFC.11 < 262.8615 to the left, improve=0.06668407, (0 missing)
## Surrogate splits:
## CFC.11 < 262.2815 to the right, agree=0.75, adj=0.25, (0 split)
##
## Node number 13: 22 observations, complexity param=0.03419353
## mean=0.4211364, MSE=0.03012985
## left son=26 (8 obs) right son=27 (14 obs)
## Primary splits:
## CFC.11 < 267.576 to the right, improve=0.6491751, (0 missing)
## CO2 < 365.69 to the left, improve=0.2845832, (0 missing)
## Surrogate splits:
## CO2 < 365.17 to the left, agree=0.773, adj=0.375, (0 split)
##
## Node number 14: 10 observations
## mean=0.4136, MSE=0.00270884
##
## Node number 15: 60 observations, complexity param=0.002099649
## mean=0.4549667, MSE=0.004121332
## left son=30 (13 obs) right son=31 (47 obs)
## Primary splits:
## CFC.11 < 250.9935 to the left, improve=0.07990522, (0 missing)
## CO2 < 373.54 to the right, improve=0.07660076, (0 missing)
## Surrogate splits:
## CO2 < 381.245 to the right, agree=0.867, adj=0.385, (0 split)
##
## Node number 16: 23 observations, complexity param=0.003150164
## mean=-0.03295652, MSE=0.005293781
## left son=32 (15 obs) right son=33 (8 obs)
## Primary splits:
## CFC.11 < 203.6895 to the right, improve=0.2402333, (0 missing)
## CO2 < 344.655 to the left, improve=0.1010386, (0 missing)
##
## Node number 17: 12 observations
## mean=0.03408333, MSE=0.00138341
##
## Node number 20: 21 observations, complexity param=0.01477739
## mean=0.08666667, MSE=0.01161756
## left son=40 (8 obs) right son=41 (13 obs)
## Primary splits:
## CO2 < 355.58 to the left, improve=0.5624159, (0 missing)
## CFC.11 < 270.91 to the left, improve=0.1800411, (0 missing)
## Surrogate splits:
## CFC.11 < 270.786 to the left, agree=0.762, adj=0.375, (0 split)
##
## Node number 21: 76 observations, complexity param=0.006402599
## mean=0.1783684, MSE=0.007217864
## left son=42 (35 obs) right son=43 (41 obs)
## Primary splits:
## CFC.11 < 259.101 to the left, improve=0.07508841, (0 missing)
## CO2 < 355.62 to the left, improve=0.06097828, (0 missing)
## Surrogate splits:
## CO2 < 352.27 to the left, agree=0.829, adj=0.629, (0 split)
##
## Node number 22: 20 observations, complexity param=0.003088947
## mean=0.21275, MSE=0.006759088
## left son=44 (7 obs) right son=45 (13 obs)
## Primary splits:
## CFC.11 < 269.6075 to the left, improve=0.2121711, (0 missing)
## CO2 < 361.905 to the right, improve=0.1144595, (0 missing)
## Surrogate splits:
## CO2 < 361.905 to the right, agree=0.8, adj=0.429, (0 split)
##
## Node number 23: 7 observations
## mean=0.3601429, MSE=0.01850412
##
## Node number 24: 16 observations
## mean=0.2625625, MSE=0.003108996
##
## Node number 25: 8 observations
## mean=0.311375, MSE=0.003591484
##
## Node number 26: 8 observations
## mean=0.236125, MSE=0.008326609
##
## Node number 27: 14 observations
## mean=0.5268571, MSE=0.01185241
##
## Node number 30: 13 observations
## mean=0.4204615, MSE=0.003037172
##
## Node number 31: 47 observations, complexity param=0.002099649
## mean=0.4645106, MSE=0.004000803
## left son=62 (38 obs) right son=63 (9 obs)
## Primary splits:
## CFC.11 < 252.375 to the right, improve=0.07930218, (0 missing)
## CO2 < 373.54 to the right, improve=0.06771785, (0 missing)
## Surrogate splits:
## CO2 < 380.705 to the left, agree=0.915, adj=0.556, (0 split)
##
## Node number 32: 15 observations
## mean=-0.059, MSE=0.005650267
##
## Node number 33: 8 observations
## mean=0.015875, MSE=0.0009691094
##
## Node number 40: 8 observations
## mean=-0.016375, MSE=0.003188984
##
## Node number 41: 13 observations
## mean=0.1500769, MSE=0.006249609
##
## Node number 42: 35 observations, complexity param=0.006402599
## mean=0.1531714, MSE=0.005849456
## left son=84 (14 obs) right son=85 (21 obs)
## Primary splits:
## CFC.11 < 249.9775 to the right, improve=0.3344520, (0 missing)
## CO2 < 349.45 to the right, improve=0.1261017, (0 missing)
## Surrogate splits:
## CO2 < 352.735 to the right, agree=0.714, adj=0.286, (0 split)
##
## Node number 43: 41 observations, complexity param=0.006402599
## mean=0.199878, MSE=0.007381375
## left son=86 (18 obs) right son=87 (23 obs)
## Primary splits:
## CFC.11 < 267.5405 to the right, improve=0.22695980, (0 missing)
## CO2 < 359.1 to the right, improve=0.04531423, (0 missing)
## Surrogate splits:
## CO2 < 356.585 to the right, agree=0.78, adj=0.5, (0 split)
##
## Node number 44: 7 observations
## mean=0.1611429, MSE=0.002477265
##
## Node number 45: 13 observations
## mean=0.2405385, MSE=0.006858402
##
## Node number 62: 38 observations, complexity param=0.002099649
## mean=0.4558421, MSE=0.004474607
## left son=124 (10 obs) right son=125 (28 obs)
## Primary splits:
## CFC.11 < 254.2805 to the left, improve=0.1684647, (0 missing)
## CO2 < 373.54 to the right, improve=0.1163419, (0 missing)
## Surrogate splits:
## CO2 < 379.145 to the right, agree=0.789, adj=0.2, (0 split)
##
## Node number 63: 9 observations
## mean=0.5011111, MSE=0.0003434321
##
## Node number 84: 14 observations
## mean=0.099, MSE=0.001928429
##
## Node number 85: 21 observations, complexity param=0.002574496
## mean=0.1892857, MSE=0.005202871
## left son=170 (7 obs) right son=171 (14 obs)
## Primary splits:
## CFC.11 < 235.2835 to the left, improve=0.21878820, (0 missing)
## CO2 < 350.52 to the right, improve=0.05970858, (0 missing)
##
## Node number 86: 18 observations
## mean=0.1536111, MSE=0.008137904
##
## Node number 87: 23 observations, complexity param=0.002349241
## mean=0.236087, MSE=0.003802949
## left son=174 (13 obs) right son=175 (10 obs)
## Primary splits:
## CO2 < 355.27 to the left, improve=0.24938660, (0 missing)
## CFC.11 < 261.366 to the right, improve=0.05169855, (0 missing)
## Surrogate splits:
## CFC.11 < 265.7465 to the left, agree=0.696, adj=0.3, (0 split)
##
## Node number 124: 10 observations
## mean=0.4099, MSE=0.00335629
##
## Node number 125: 28 observations, complexity param=0.001886464
## mean=0.47225, MSE=0.003850973
## left son=250 (21 obs) right son=251 (7 obs)
## Primary splits:
## CFC.11 < 255.66 to the right, improve=0.16244780, (0 missing)
## CO2 < 373.54 to the right, improve=0.08647264, (0 missing)
## Surrogate splits:
## CO2 < 378.615 to the left, agree=0.821, adj=0.286, (0 split)
##
## Node number 170: 7 observations
## mean=0.1415714, MSE=0.006245959
##
## Node number 171: 14 observations
## mean=0.2131429, MSE=0.002973837
##
## Node number 174: 13 observations
## mean=0.2090769, MSE=0.001523148
##
## Node number 175: 10 observations
## mean=0.2712, MSE=0.00458536
##
## Node number 250: 21 observations, complexity param=0.00154014
## mean=0.4578095, MSE=0.00406063
## left son=500 (14 obs) right son=501 (7 obs)
## Primary splits:
## CFC.11 < 259.791 to the left, improve=0.16770320, (0 missing)
## CO2 < 374.935 to the right, improve=0.09516095, (0 missing)
## Surrogate splits:
## CO2 < 373.965 to the right, agree=0.952, adj=0.857, (0 split)
##
## Node number 251: 7 observations
## mean=0.5155714, MSE=0.0007196735
##
## Node number 500: 14 observations
## mean=0.4393571, MSE=0.001938658
##
## Node number 501: 7 observations
## mean=0.4947143, MSE=0.006261633
##
## n= 284
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 284 9.285256000 0.247799300
## 2) CO2< 364.25 168 2.653182000 0.136809500
## 4) CFC.11< 229.452 44 0.335775600 0.018909090
## 8) CFC.11>=197.489 35 0.173799000 -0.009971429
## 16) CFC.11< 217.854 23 0.121757000 -0.032956520
## 32) CFC.11>=203.6895 15 0.084754000 -0.059000000 *
## 33) CFC.11< 203.6895 8 0.007752875 0.015875000 *
## 17) CFC.11>=217.854 12 0.016600920 0.034083330 *
## 9) CFC.11< 197.489 9 0.019255560 0.131222200 *
## 5) CFC.11>=229.452 124 1.488756000 0.178645200
## 10) CO2< 359.735 97 0.930888200 0.158515500
## 20) CFC.11>=270.174 21 0.243968700 0.086666670
## 40) CO2< 355.58 8 0.025511880 -0.016375000 *
## 41) CO2>=355.58 13 0.081244920 0.150076900 *
## 21) CFC.11< 270.174 76 0.548557700 0.178368400
## 42) CFC.11< 259.101 35 0.204731000 0.153171400
## 84) CFC.11>=249.9775 14 0.026998000 0.099000000 *
## 85) CFC.11< 249.9775 21 0.109260300 0.189285700
## 170) CFC.11< 235.2835 7 0.043721710 0.141571400 *
## 171) CFC.11>=235.2835 14 0.041633710 0.213142900 *
## 43) CFC.11>=259.101 41 0.302636400 0.199878000
## 86) CFC.11>=267.5405 18 0.146482300 0.153611100 *
## 87) CFC.11< 267.5405 23 0.087467830 0.236087000
## 174) CO2< 355.27 13 0.019800920 0.209076900 *
## 175) CO2>=355.27 10 0.045853600 0.271200000 *
## 11) CO2>=359.735 27 0.377357000 0.250963000
## 22) CFC.11>=268.694 20 0.135181800 0.212750000
## 44) CFC.11< 269.6075 7 0.017340860 0.161142900 *
## 45) CFC.11>=269.6075 13 0.089159230 0.240538500 *
## 23) CFC.11< 268.694 7 0.129528900 0.360142900 *
## 3) CO2>=364.25 116 1.565253000 0.408543100
## 6) CFC.11>=261.795 46 0.986476500 0.346891300
## 12) CFC.11< 265.5055 24 0.091183330 0.278833300
## 24) CO2< 370.425 16 0.049743940 0.262562500 *
## 25) CO2>=370.425 8 0.028731880 0.311375000 *
## 13) CFC.11>=265.5055 22 0.662856600 0.421136400
## 26) CFC.11>=267.576 8 0.066612870 0.236125000 *
## 27) CFC.11< 267.576 14 0.165933700 0.526857100 *
## 7) CFC.11< 261.795 70 0.289035800 0.449057100
## 14) CO2< 372.325 10 0.027088400 0.413600000 *
## 15) CO2>=372.325 60 0.247279900 0.454966700
## 30) CFC.11< 250.9935 13 0.039483230 0.420461500 *
## 31) CFC.11>=250.9935 47 0.188037700 0.464510600
## 62) CFC.11>=252.375 38 0.170035100 0.455842100
## 124) CFC.11< 254.2805 10 0.033562900 0.409900000 *
## 125) CFC.11>=254.2805 28 0.107827300 0.472250000
## 250) CFC.11>=255.66 21 0.085273240 0.457809500
## 500) CFC.11< 259.791 14 0.027141210 0.439357100 *
## 501) CFC.11>=259.791 7 0.043831430 0.494714300 *
## 251) CFC.11< 255.66 7 0.005037714 0.515571400 *
## 63) CFC.11< 252.375 9 0.003090889 0.501111100 *
## [1] " calling mypredict_mdl for fit:"
## [1] " calling mypredict_mdl for OOB:"
## model_id model_method feats max.nTuningRuns
## 1 Max.cor.Y.cv.0.cp.0.rpart rpart CO2, CFC.11 0
## min.elapsedtime.everything min.elapsedtime.final max.R.sq.fit
## 1 0.446 0.009 0.8615119
## min.RMSE.fit max.R.sq.OOB min.RMSE.OOB
## 1 0.06728899 0.4082889 0.1202002
# Max.cor.Y rpart with cross-validation (glb_n_cv_folds folds).
# NOTE(review): `glb_is_binomial` may be NULL; this only works because
# `||` short-circuits when glb_is_regression is TRUE -- confirm before
# reusing this template for classification runs.
if (glb_is_regression || glb_is_binomial) # For multinomials this model will be run next by default
ret_lst <- myfit_mdl(model_id="Max.cor.Y",
model_method="rpart",
model_type=glb_model_type,
indep_vars_vctr=max_cor_y_x_vars,
rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df,
n_cv_folds=glb_n_cv_folds, tune_models_df=NULL)
## [1] "fitting model: Max.cor.Y.rpart"
## [1] " indep_vars: CO2, CFC.11"
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.0342 on full training set
## Warning in myfit_mdl(model_id = "Max.cor.Y", model_method = "rpart",
## model_type = glb_model_type, : model's bestTune found at an extreme of
## tuneGrid for parameter: cp
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 284
##
## CP nsplit rel error
## 1 0.54568459 0 1.0000000
## 2 0.08924363 1 0.4543154
## 3 0.03419353 2 0.3650718
##
## Variable importance
## CO2 CFC.11
## 81 19
##
## Node number 1: 284 observations, complexity param=0.5456846
## mean=0.2477993, MSE=0.03269456
## left son=2 (168 obs) right son=3 (116 obs)
## Primary splits:
## CO2 < 364.25 to the left, improve=0.5456846, (0 missing)
## CFC.11 < 234.288 to the left, improve=0.3052006, (0 missing)
## Surrogate splits:
## CFC.11 < 248.3335 to the left, agree=0.634, adj=0.103, (0 split)
##
## Node number 2: 168 observations, complexity param=0.08924363
## mean=0.1368095, MSE=0.01579275
## left son=4 (44 obs) right son=5 (124 obs)
## Primary splits:
## CFC.11 < 229.452 to the left, improve=0.3123231, (0 missing)
## CO2 < 355.61 to the left, improve=0.2438382, (0 missing)
## Surrogate splits:
## CO2 < 348.29 to the left, agree=0.946, adj=0.795, (0 split)
##
## Node number 3: 116 observations
## mean=0.4085431, MSE=0.01349356
##
## Node number 4: 44 observations
## mean=0.01890909, MSE=0.007631264
##
## Node number 5: 124 observations
## mean=0.1786452, MSE=0.0120061
##
## n= 284
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 284 9.2852560 0.24779930
## 2) CO2< 364.25 168 2.6531820 0.13680950
## 4) CFC.11< 229.452 44 0.3357756 0.01890909 *
## 5) CFC.11>=229.452 124 1.4887560 0.17864520 *
## 3) CO2>=364.25 116 1.5652530 0.40854310 *
## [1] " calling mypredict_mdl for fit:"
## [1] " calling mypredict_mdl for OOB:"
## model_id model_method feats max.nTuningRuns
## 1 Max.cor.Y.rpart rpart CO2, CFC.11 3
## min.elapsedtime.everything min.elapsedtime.final max.R.sq.fit
## 1 0.968 0.011 0.6349282
## min.RMSE.fit max.R.sq.OOB min.RMSE.OOB max.Rsquared.fit min.RMSESD.fit
## 1 0.1242177 0.4585666 0.1149801 0.5396945 0.009132685
## max.RsquaredSD.fit
## 1 0.06111794
# Used to compare vs. Interactions.High.cor.Y and/or Max.cor.Y.TmSrs
# Linear (lm/glm) variant of the Max.cor.Y model on the same two predictors;
# note it reuses the model_id "Max.cor.Y", so it is distinguished from the
# rpart fit above only by the method suffix (Max.cor.Y.lm vs Max.cor.Y.rpart).
ret_lst <- myfit_mdl(model_id="Max.cor.Y",
model_method=ifelse(glb_is_regression, "lm",
ifelse(glb_is_binomial, "glm", "rpart")),
model_type=glb_model_type,
indep_vars_vctr=max_cor_y_x_vars,
rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df,
n_cv_folds=glb_n_cv_folds, tune_models_df=NULL)
## [1] "fitting model: Max.cor.Y.lm"
## [1] " indep_vars: CO2, CFC.11"
## Aggregating results
## Fitting final model on full training set
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.31831 -0.07832 -0.00132 0.05819 0.43391
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.262e+00 2.126e-01 -20.051 <2e-16 ***
## CO2 1.246e-02 6.772e-04 18.397 <2e-16 ***
## CFC.11 2.767e-05 3.691e-04 0.075 0.94
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1118 on 281 degrees of freedom
## Multiple R-squared: 0.6218, Adjusted R-squared: 0.6191
## F-statistic: 231 on 2 and 281 DF, p-value: < 2.2e-16
##
## [1] " calling mypredict_mdl for fit:"
## [1] " calling mypredict_mdl for OOB:"
## model_id model_method feats max.nTuningRuns
## 1 Max.cor.Y.lm lm CO2, CFC.11 1
## min.elapsedtime.everything min.elapsedtime.final max.R.sq.fit
## 1 0.831 0.002 0.6217859
## min.RMSE.fit max.R.sq.OOB min.RMSE.OOB max.Adj.R.sq.fit max.Rsquared.fit
## 1 0.112763 -0.780365 0.2084993 0.619094 0.618948
## min.RMSESD.fit max.RsquaredSD.fit
## 1 0.01011687 0.06160671
# Time-series polynomial model: only attempted when date variables exist AND
# polynomial day-minute features (<date_var>.day.minutes.poly.*) were
# engineered earlier; the grep below pulls every matching polynomial column
# into the predictor set alongside the two max-correlation features.
if (!is.null(glb_date_vars) &&
(sum(grepl(paste(glb_date_vars, "\\.day\\.minutes\\.poly\\.", sep=""),
names(glb_allobs_df))) > 0)) {
#     ret_lst <- myfit_mdl(model_id="Max.cor.Y.TmSrs.poly1",
#                         model_method=ifelse(glb_is_regression, "lm",
#                                         ifelse(glb_is_binomial, "glm", "rpart")),
#                      model_type=glb_model_type,
#                         indep_vars_vctr=c(max_cor_y_x_vars, paste0(glb_date_vars, ".day.minutes")),
#                         rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
#                         fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df,
#                         n_cv_folds=glb_n_cv_folds, tune_models_df=NULL)
#
ret_lst <- myfit_mdl(model_id="Max.cor.Y.TmSrs.poly",
model_method=ifelse(glb_is_regression, "lm",
ifelse(glb_is_binomial, "glm", "rpart")),
model_type=glb_model_type,
indep_vars_vctr=c(max_cor_y_x_vars,
grep(paste(glb_date_vars, "\\.day\\.minutes\\.poly\\.", sep=""),
names(glb_allobs_df), value=TRUE)),
rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df,
n_cv_folds=glb_n_cv_folds, tune_models_df=NULL)
}
# Interactions.High.cor.Y: augment the two strongest predictors with
# interaction terms ("top_var:feat") for every feature flagged as highly
# correlated with another feature (cor.high.X). lm/glm accept formula
# interaction terms; rpart/rf do not, so they get the plain union instead.
if (length(int_feats <- setdiff(unique(glb_feats_df$cor.high.X), NA)) > 0) {
    # lm & glm handle interaction terms; rpart & rf do not
    if (glb_is_regression || glb_is_binomial) {
        indep_vars_vctr <-
            c(max_cor_y_x_vars, paste(max_cor_y_x_vars[1], int_feats, sep=":"))
    } else { indep_vars_vctr <- union(max_cor_y_x_vars, int_feats) }
    # All arguments are passed by name (the predictor and response args were
    # positional before), matching the other myfit_mdl calls and staying
    # robust to any change in myfit_mdl's formal-argument order.
    ret_lst <- myfit_mdl(model_id="Interact.High.cor.Y",
                         model_method=ifelse(glb_is_regression, "lm",
                                             ifelse(glb_is_binomial, "glm", "rpart")),
                         model_type=glb_model_type,
                         indep_vars_vctr=indep_vars_vctr,
                         rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
                         fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df,
                         n_cv_folds=glb_n_cv_folds, tune_models_df=NULL)
}
## [1] "fitting model: Interact.High.cor.Y.lm"
## [1] " indep_vars: CO2, CFC.11, CO2:CO2, CO2:Year, CO2:CH4"
## Aggregating results
## Fitting final model on full training set
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32044 -0.07342 -0.00413 0.05984 0.43166
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.186e+00 1.216e+00 -1.797 0.0734 .
## CO2 -4.535e-02 3.794e-02 -1.195 0.2330
## CFC.11 -2.123e-04 5.998e-04 -0.354 0.7237
## `CO2:Year` 2.600e-05 1.813e-05 1.434 0.1528
## `CO2:CH4` 2.100e-07 1.600e-06 0.131 0.8957
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1116 on 279 degrees of freedom
## Multiple R-squared: 0.6258, Adjusted R-squared: 0.6205
## F-statistic: 116.7 on 4 and 279 DF, p-value: < 2.2e-16
##
## [1] " calling mypredict_mdl for fit:"
## [1] " calling mypredict_mdl for OOB:"
## model_id model_method
## 1 Interact.High.cor.Y.lm lm
## feats max.nTuningRuns
## 1 CO2, CFC.11, CO2:CO2, CO2:Year, CO2:CH4 1
## min.elapsedtime.everything min.elapsedtime.final max.R.sq.fit
## 1 0.825 0.003 0.6258481
## min.RMSE.fit max.R.sq.OOB min.RMSE.OOB max.Adj.R.sq.fit max.Rsquared.fit
## 1 0.1145124 -0.7923166 0.209198 0.6204839 0.6072395
## min.RMSESD.fit max.RsquaredSD.fit
## 1 0.009471343 0.06006856
# Low.cor.X
# Candidate predictors for the Low.cor.X model: features that are not
# collinear with another feature (cor.high.X is NA), not near-zero-variance,
# and not explicitly excluded from modeling.
# if (glb_is_classification && glb_is_binomial)
#     indep_vars_vctr <- subset(glb_feats_df, is.na(cor.high.X) &
#                               is.ConditionalX.y &
#                               (exclude.as.feat != 1))[, "id"] else
indep_vars_vctr <- subset(glb_feats_df, is.na(cor.high.X) & !myNearZV &
(exclude.as.feat != 1))[, "id"]
# Replace each feature that carries an interaction partner (non-NA
# interaction.feat in the feature metadata) with its "<partner>:<feature>"
# interaction term, leaving all other entries of vars_vctr untouched.
#
# Args:
#   vars_vctr: character vector of candidate predictor names.
#   feats_df:  feature metadata with `id` and `interaction.feat` columns;
#              defaults to the global glb_feats_df so existing one-argument
#              callers are unaffected (parameterized here for testability).
# Returns: the adjusted character vector (order may change: replaced terms
#          are appended via union()).
myadjust_interaction_feats <- function(vars_vctr, feats_df = glb_feats_df) {
    for (feat in subset(feats_df, !is.na(interaction.feat))$id)
        if (feat %in% vars_vctr)
            vars_vctr <- union(setdiff(vars_vctr, feat),
                paste0(feats_df[feats_df$id == feat, "interaction.feat"], ":", feat))
    return(vars_vctr)
}
# Swap any interaction-flagged features for their interaction terms.
indep_vars_vctr <- myadjust_interaction_feats(indep_vars_vctr)
# Low.cor.X model over all low-correlation candidates. The response arguments
# are passed by name (they were positional before), matching the other
# myfit_mdl calls and staying robust to changes in the formal-argument order.
ret_lst <- myfit_mdl(model_id="Low.cor.X",
                     model_method=ifelse(glb_is_regression, "lm",
                                         ifelse(glb_is_binomial, "glm", "rpart")),
                     indep_vars_vctr=indep_vars_vctr,
                     model_type=glb_model_type,
                     rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
                     fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df,
                     n_cv_folds=glb_n_cv_folds, tune_models_df=NULL)
## [1] "fitting model: Low.cor.X.lm"
## [1] " indep_vars: CO2, CFC.11, TSI, MEI, .rnorm, Month, Aerosols"
## Aggregating results
## Fitting final model on full training set
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.26073 -0.06404 -0.00721 0.06001 0.34778
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.222e+02 2.045e+01 -5.975 7.07e-09 ***
## CO2 1.092e-02 6.297e-04 17.340 < 2e-16 ***
## CFC.11 -2.951e-04 3.251e-04 -0.908 0.365
## TSI 8.679e-02 1.499e-02 5.788 1.93e-08 ***
## MEI 6.267e-02 6.622e-03 9.464 < 2e-16 ***
## .rnorm -3.218e-03 6.175e-03 -0.521 0.603
## Month -8.640e-04 1.641e-03 -0.526 0.599
## Aerosols -1.577e+00 2.200e-01 -7.171 6.86e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09442 on 276 degrees of freedom
## Multiple R-squared: 0.735, Adjusted R-squared: 0.7283
## F-statistic: 109.3 on 7 and 276 DF, p-value: < 2.2e-16
##
## [1] " calling mypredict_mdl for fit:"
## [1] " calling mypredict_mdl for OOB:"
## model_id model_method feats
## 1 Low.cor.X.lm lm CO2, CFC.11, TSI, MEI, .rnorm, Month, Aerosols
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 1 0.852 0.004
## max.R.sq.fit min.RMSE.fit max.R.sq.OOB min.RMSE.OOB max.Adj.R.sq.fit
## 1 0.7349783 0.09714225 0.3714425 0.1238861 0.7282567
## max.Rsquared.fit min.RMSESD.fit max.RsquaredSD.fit
## 1 0.714524 0.009508754 0.05451909
# fit.models bookkeeping: drop the last model-fit result and record the
# elapsed time of the first fit.models pass (minor step increment).
rm(ret_lst)
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc=FALSE)
## label step_major step_minor bgn end elapsed
## 10 fit.models 7 0 19.130 34.274 15.144
## 11 fit.models 7 1 34.275 NA NA
# Start a fresh sub-chunk timer for the per-method model loop below.
fit.models_1_chunk_df <- myadd_chunk(NULL, "fit.models_1_bgn")
## label step_major step_minor bgn end elapsed
## 1 fit.models_1_bgn 1 0 36.466 NA NA
# Options:
# 1. rpart & rf manual tuning
# 2. rf without pca (default: with pca)
#stop(here); sav_models_lst <- glb_models_lst; sav_models_df <- glb_models_df
#glb_models_lst <- sav_models_lst; glb_models_df <- sav_models_df
# All X that is not user excluded
# if (glb_is_classification && glb_is_binomial) {
#     model_id_pfx <- "Conditional.X"
# #     indep_vars_vctr <- setdiff(names(glb_fitobs_df), union(glb_rsp_var, glb_exclude_vars_as_features))
#     indep_vars_vctr <- subset(glb_feats_df, is.ConditionalX.y &
#                         (exclude.as.feat != 1))[, "id"]
# } else {
# All.X models: every feature that is not near-zero-variance and not
# explicitly excluded (unlike Low.cor.X, collinear features are kept).
model_id_pfx <- "All.X"
indep_vars_vctr <- subset(glb_feats_df, !myNearZV &
(exclude.as.feat != 1))[, "id"]
# }
# Substitute interaction terms for any interaction-flagged features.
indep_vars_vctr <- myadjust_interaction_feats(indep_vars_vctr)
# Fit one "All.X" model per requested caret method.
# NOTE(review): indep_vars_vctr is shrunk IN PLACE when an rpart/rf method
# is encountered, so any method listed after them in glb_models_method_vctr
# is also fit without .rnorm — confirm this ordering is intended.
for (method in glb_models_method_vctr) {
# Record a timing entry for this method's fit
fit.models_1_chunk_df <- myadd_chunk(fit.models_1_chunk_df,
paste0("fit.models_1_", method), major.inc=TRUE)
if (method %in% c("rpart", "rf")) {
# rpart: fubar's the tree
# rf: skip the scenario w/ .rnorm for speed
indep_vars_vctr <- setdiff(indep_vars_vctr, c(".rnorm"))
model_id <- paste0(model_id_pfx, ".no.rnorm")
} else model_id <- model_id_pfx
# Fit + cross-validate; presumably myfit_mdl also appends its results to
# glb_models_lst / glb_models_df as a side effect (verify in mydsutils.R)
ret_lst <- myfit_mdl(model_id=model_id, model_method=method,
indep_vars_vctr=indep_vars_vctr,
model_type=glb_model_type,
rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df,
n_cv_folds=glb_n_cv_folds, tune_models_df=glb_tune_models_df)
# If All.X.glm is less accurate than Low.Cor.X.glm
# check NA coefficients & filter appropriate terms in indep_vars_vctr
# if (method == "glm") {
# orig_glm <- glb_models_lst[[paste0(model_id, ".", model_method)]]$finalModel
# orig_glm <- glb_models_lst[["All.X.glm"]]$finalModel; print(summary(orig_glm))
# vif_orig_glm <- vif(orig_glm); print(vif_orig_glm)
# print(vif_orig_glm[!is.na(vif_orig_glm) & (vif_orig_glm == Inf)])
# print(which.max(vif_orig_glm))
# print(sort(vif_orig_glm[vif_orig_glm >= 1.0e+03], decreasing=TRUE))
# glb_fitobs_df[c(1143, 3637, 3953, 4105), c("UniqueID", "Popular", "H.P.quandary", "Headline")]
# glb_feats_df[glb_feats_df$id %in% grep("[HSA]\\.nchrs.log", glb_feats_df$id, value=TRUE) | glb_feats_df$cor.high.X %in% grep("[HSA]\\.nchrs.log", glb_feats_df$id, value=TRUE), ]
# glb_feats_df[glb_feats_df$id %in% grep("[HSA]\\.npnct14.log", glb_feats_df$id, value=TRUE) | glb_feats_df$cor.high.X %in% grep("[HSA]\\.npnct14.log", glb_feats_df$id, value=TRUE), ]
# glb_feats_df[glb_feats_df$id %in% grep("[HSA]\\.T.scen", glb_feats_df$id, value=TRUE) | glb_feats_df$cor.high.X %in% grep("[HSA]\\.T.scen", glb_feats_df$id, value=TRUE), ]
# glb_feats_df[glb_feats_df$id %in% grep("[HSA]\\.P.first", glb_feats_df$id, value=TRUE) | glb_feats_df$cor.high.X %in% grep("[HSA]\\.P.first", glb_feats_df$id, value=TRUE), ]
# all.equal(glb_allobs_df$S.nuppr.log, glb_allobs_df$A.nuppr.log)
# all.equal(glb_allobs_df$S.npnct19.log, glb_allobs_df$A.npnct19.log)
# all.equal(glb_allobs_df$S.P.year.colon, glb_allobs_df$A.P.year.colon)
# all.equal(glb_allobs_df$S.T.share, glb_allobs_df$A.T.share)
# all.equal(glb_allobs_df$H.T.clip, glb_allobs_df$H.P.daily.clip.report)
# cor(glb_allobs_df$S.T.herald, glb_allobs_df$S.T.tribun)
# dsp_obs(Abstract.contains="[Dd]iar", cols=("Abstract"), all=TRUE)
# dsp_obs(Abstract.contains="[Ss]hare", cols=("Abstract"), all=TRUE)
# subset(glb_feats_df, cor.y.abs <= glb_feats_df[glb_feats_df$id == ".rnorm", "cor.y.abs"])
# corxx_mtrx <- cor(data.matrix(glb_allobs_df[, setdiff(names(glb_allobs_df), myfind_chr_cols_df(glb_allobs_df))]), use="pairwise.complete.obs"); abs_corxx_mtrx <- abs(corxx_mtrx); diag(abs_corxx_mtrx) <- 0
# which.max(abs_corxx_mtrx["S.T.tribun", ])
# abs_corxx_mtrx["A.npnct08.log", "S.npnct08.log"]
# step_glm <- step(orig_glm)
# }
# Since caret does not optimize rpart well
# if (method == "rpart")
# ret_lst <- myfit_mdl(model_id=paste0(model_id_pfx, ".cp.0"), model_method=method,
# indep_vars_vctr=indep_vars_vctr,
# model_type=glb_model_type,
# rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
# fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df,
# n_cv_folds=0, tune_models_df=data.frame(parameter="cp", min=0.0, max=0.0, by=0.1))
}
## label step_major step_minor bgn end elapsed
## 1 fit.models_1_bgn 1 0 36.466 36.479 0.013
## 2 fit.models_1_lm 2 0 36.479 NA NA
## [1] "fitting model: All.X.lm"
## [1] " indep_vars: CO2, Year, N2O, CH4, CFC.12, CFC.11, TSI, MEI, .rnorm, Month, Aerosols"
## Aggregating results
## Fitting final model on full training set
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.27254 -0.05743 -0.00018 0.05243 0.32473
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.376e+02 5.560e+01 -2.475 0.01392 *
## CO2 2.655e-03 3.247e-03 0.818 0.41422
## Year 6.354e-03 2.121e-02 0.300 0.76471
## N2O -1.594e-02 1.846e-02 -0.863 0.38873
## CH4 1.868e-04 5.323e-04 0.351 0.72585
## CFC.12 3.683e-03 1.752e-03 2.102 0.03648 *
## CFC.11 -6.653e-03 2.318e-03 -2.871 0.00442 **
## TSI 9.427e-02 1.850e-02 5.095 6.52e-07 ***
## MEI 6.471e-02 6.487e-03 9.975 < 2e-16 ***
## .rnorm -6.153e-03 6.053e-03 -1.017 0.31025
## Month -3.706e-03 2.178e-03 -1.701 0.09004 .
## Aerosols -1.568e+00 2.211e-01 -7.090 1.15e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09129 on 272 degrees of freedom
## Multiple R-squared: 0.7559, Adjusted R-squared: 0.746
## F-statistic: 76.56 on 11 and 272 DF, p-value: < 2.2e-16
##
## [1] " calling mypredict_mdl for fit:"
## [1] " calling mypredict_mdl for OOB:"
## model_id model_method
## 1 All.X.lm lm
## feats
## 1 CO2, Year, N2O, CH4, CFC.12, CFC.11, TSI, MEI, .rnorm, Month, Aerosols
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 1 0.844 0.005
## max.R.sq.fit min.RMSE.fit max.R.sq.OOB min.RMSE.OOB max.Adj.R.sq.fit
## 1 0.7558699 0.09468322 0.6432956 0.09332638 0.745997
## max.Rsquared.fit min.RMSESD.fit max.RsquaredSD.fit
## 1 0.7270083 0.008506479 0.04795456
## label step_major step_minor bgn end elapsed
## 2 fit.models_1_lm 2 0 36.479 38.741 2.262
## 3 fit.models_1_glm 3 0 38.741 NA NA
## [1] "fitting model: All.X.glm"
## [1] " indep_vars: CO2, Year, N2O, CH4, CFC.12, CFC.11, TSI, MEI, .rnorm, Month, Aerosols"
## Aggregating results
## Fitting final model on full training set
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.27254 -0.05743 -0.00018 0.05243 0.32473
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.376e+02 5.560e+01 -2.475 0.01392 *
## CO2 2.655e-03 3.247e-03 0.818 0.41422
## Year 6.354e-03 2.121e-02 0.300 0.76471
## N2O -1.594e-02 1.846e-02 -0.863 0.38873
## CH4 1.868e-04 5.323e-04 0.351 0.72585
## CFC.12 3.683e-03 1.752e-03 2.102 0.03648 *
## CFC.11 -6.653e-03 2.318e-03 -2.871 0.00442 **
## TSI 9.427e-02 1.850e-02 5.095 6.52e-07 ***
## MEI 6.471e-02 6.487e-03 9.975 < 2e-16 ***
## .rnorm -6.153e-03 6.053e-03 -1.017 0.31025
## Month -3.706e-03 2.178e-03 -1.701 0.09004 .
## Aerosols -1.568e+00 2.211e-01 -7.090 1.15e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.008333862)
##
## Null deviance: 9.2853 on 283 degrees of freedom
## Residual deviance: 2.2668 on 272 degrees of freedom
## AIC: -539.93
##
## Number of Fisher Scoring iterations: 2
##
## [1] " calling mypredict_mdl for fit:"
## [1] " calling mypredict_mdl for OOB:"
## model_id model_method
## 1 All.X.glm glm
## feats
## 1 CO2, Year, N2O, CH4, CFC.12, CFC.11, TSI, MEI, .rnorm, Month, Aerosols
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 1 0.877 0.019
## max.R.sq.fit min.RMSE.fit max.R.sq.OOB min.RMSE.OOB min.aic.fit
## 1 0.7558699 0.09468322 0.6432956 0.09332638 -539.9334
## max.Rsquared.fit min.RMSESD.fit max.RsquaredSD.fit
## 1 0.7270083 0.008506479 0.04795456
## label step_major step_minor bgn end elapsed
## 3 fit.models_1_glm 3 0 38.741 41.977 3.237
## 4 fit.models_1_bayesglm 4 0 41.982 NA NA
## [1] "fitting model: All.X.bayesglm"
## [1] " indep_vars: CO2, Year, N2O, CH4, CFC.12, CFC.11, TSI, MEI, .rnorm, Month, Aerosols"
## Loading required package: arm
## Loading required package: MASS
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
##
## Loading required package: Matrix
## Loading required package: lme4
## Loading required package: Rcpp
##
## arm (Version 1.8-5, built: 2015-05-13)
##
## Working directory is /Users/bbalaji-2012/Documents/Work/Courses/MIT/Analytics_Edge_15_071x/Assignments/HW2_GLB_Climate
## Aggregating results
## Fitting final model on full training set
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.27327 -0.05646 0.00005 0.05321 0.32523
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.396e+02 5.551e+01 -2.515 0.01247 *
## CO2 2.654e-03 3.246e-03 0.817 0.41442
## Year 7.444e-03 2.117e-02 0.352 0.72537
## N2O -1.690e-02 1.843e-02 -0.917 0.35984
## CH4 1.989e-04 5.322e-04 0.374 0.70887
## CFC.12 3.612e-03 1.750e-03 2.064 0.03998 *
## CFC.11 -6.583e-03 2.316e-03 -2.842 0.00482 **
## TSI 9.435e-02 1.849e-02 5.104 6.27e-07 ***
## MEI 6.419e-02 6.474e-03 9.916 < 2e-16 ***
## .rnorm -6.013e-03 6.052e-03 -0.994 0.32129
## Month -3.630e-03 2.177e-03 -1.668 0.09656 .
## Aerosols -1.521e+00 2.178e-01 -6.984 2.20e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.008335232)
##
## Null deviance: 9.2853 on 283 degrees of freedom
## Residual deviance: 2.2672 on 272 degrees of freedom
## AIC: -539.89
##
## Number of Fisher Scoring iterations: 7
##
## [1] " calling mypredict_mdl for fit:"
## [1] " calling mypredict_mdl for OOB:"
## model_id model_method
## 1 All.X.bayesglm bayesglm
## feats
## 1 CO2, Year, N2O, CH4, CFC.12, CFC.11, TSI, MEI, .rnorm, Month, Aerosols
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 1 2.046 0.029
## max.R.sq.fit min.RMSE.fit max.R.sq.OOB min.RMSE.OOB min.aic.fit
## 1 0.7558298 0.0946489 0.6403082 0.09371637 -539.8868
## max.Rsquared.fit min.RMSESD.fit max.RsquaredSD.fit
## 1 0.7271964 0.008485447 0.04832533
## label step_major step_minor bgn end elapsed
## 4 fit.models_1_bayesglm 4 0 41.982 45.929 3.947
## 5 fit.models_1_rpart 5 0 45.930 NA NA
## [1] "fitting model: All.X.no.rnorm.rpart"
## [1] " indep_vars: CO2, Year, N2O, CH4, CFC.12, CFC.11, TSI, MEI, Month, Aerosols"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.0368 on full training set
## Warning in myfit_mdl(model_id = model_id, model_method = method,
## indep_vars_vctr = indep_vars_vctr, : model's bestTune found at an extreme
## of tuneGrid for parameter: cp
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 284
##
## CP nsplit rel error
## 1 0.62580255 0 1.0000000
## 2 0.07878880 1 0.3741975
## 3 0.03681088 2 0.2954087
##
## Variable importance
## CFC.12 Year N2O CO2 Aerosols CH4 CFC.11
## 18 17 17 16 15 14 2
##
## Node number 1: 284 observations, complexity param=0.6258025
## mean=0.2477993, MSE=0.03269456
## left son=2 (169 obs) right son=3 (115 obs)
## Primary splits:
## CFC.12 < 532.5135 to the left, improve=0.6258025, (0 missing)
## N2O < 312.5435 to the left, improve=0.6204381, (0 missing)
## Year < 1996.5 to the left, improve=0.6106580, (0 missing)
## Aerosols < 0.00545 to the right, improve=0.5944675, (0 missing)
## CO2 < 364.25 to the left, improve=0.5456846, (0 missing)
## Surrogate splits:
## N2O < 312.5435 to the left, agree=0.993, adj=0.983, (0 split)
## Year < 1996.5 to the left, agree=0.982, adj=0.957, (0 split)
## Aerosols < 0.00515 to the right, agree=0.979, adj=0.948, (0 split)
## CO2 < 364.25 to the left, agree=0.961, adj=0.904, (0 split)
## CH4 < 1764.905 to the left, agree=0.912, adj=0.783, (0 split)
##
## Node number 2: 169 observations, complexity param=0.0787888
## mean=0.1298047, MSE=0.01313109
## left son=4 (44 obs) right son=5 (125 obs)
## Primary splits:
## Year < 1986.5 to the left, improve=0.3296634, (0 missing)
## CFC.12 < 412.016 to the left, improve=0.3296634, (0 missing)
## CFC.11 < 229.452 to the left, improve=0.3296634, (0 missing)
## CH4 < 1698.185 to the left, improve=0.2487246, (0 missing)
## N2O < 306.069 to the left, improve=0.2443602, (0 missing)
## Surrogate splits:
## CFC.12 < 412.016 to the left, agree=1.000, adj=1.000, (0 split)
## CFC.11 < 229.452 to the left, agree=1.000, adj=1.000, (0 split)
## CH4 < 1686.9 to the left, agree=0.959, adj=0.841, (0 split)
## CO2 < 348.29 to the left, agree=0.947, adj=0.795, (0 split)
## N2O < 305.7935 to the left, agree=0.923, adj=0.705, (0 split)
##
## Node number 3: 115 observations
## mean=0.4212, MSE=0.01091621
##
## Node number 4: 44 observations
## mean=0.01890909, MSE=0.007631264
##
## Node number 5: 125 observations
## mean=0.16884, MSE=0.009214438
##
## n= 284
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 284 9.2852560 0.24779930
## 2) CFC.12< 532.5135 169 2.2191550 0.12980470
## 4) Year< 1986.5 44 0.3357756 0.01890909 *
## 5) Year>=1986.5 125 1.1518050 0.16884000 *
## 3) CFC.12>=532.5135 115 1.2553640 0.42120000 *
## [1] " calling mypredict_mdl for fit:"
## [1] " calling mypredict_mdl for OOB:"
## model_id model_method
## 1 All.X.no.rnorm.rpart rpart
## feats
## 1 CO2, Year, N2O, CH4, CFC.12, CFC.11, TSI, MEI, Month, Aerosols
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 3 0.917 0.027
## max.R.sq.fit min.RMSE.fit max.R.sq.OOB min.RMSE.OOB max.Rsquared.fit
## 1 0.7045913 0.1007468 0.4047909 0.1205549 0.692982
## min.RMSESD.fit max.RsquaredSD.fit
## 1 0.00352263 0.008054921
## label step_major step_minor bgn end elapsed
## 5 fit.models_1_rpart 5 0 45.930 48.967 3.037
## 6 fit.models_1_rf 6 0 48.967 NA NA
## [1] "fitting model: All.X.no.rnorm.rf"
## [1] " indep_vars: CO2, Year, N2O, CH4, CFC.12, CFC.11, TSI, MEI, Month, Aerosols"
## Loading required package: randomForest
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2 on full training set
## Warning in myfit_mdl(model_id = model_id, model_method = method,
## indep_vars_vctr = indep_vars_vctr, : model's bestTune found at an extreme
## of tuneGrid for parameter: mtry
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 284 -none- numeric
## mse 500 -none- numeric
## rsq 500 -none- numeric
## oob.times 284 -none- numeric
## importance 10 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 284 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## xNames 10 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 1 -none- logical
## [1] " calling mypredict_mdl for fit:"
## [1] " calling mypredict_mdl for OOB:"
## model_id model_method
## 1 All.X.no.rnorm.rf rf
## feats
## 1 CO2, Year, N2O, CH4, CFC.12, CFC.11, TSI, MEI, Month, Aerosols
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 3 2.559 0.31
## max.R.sq.fit min.RMSE.fit max.R.sq.OOB min.RMSE.OOB max.Rsquared.fit
## 1 0.9704364 0.07275241 0.3658896 0.124516 0.8404313
## min.RMSESD.fit max.RsquaredSD.fit
## 1 0.003631677 0.01949359
# User specified
# Ensure at least 2 vars in each regression; else varImp crashes
# sav_models_lst <- glb_models_lst; sav_models_df <- glb_models_df; sav_featsimp_df <- glb_featsimp_df
# glb_models_lst <- sav_models_lst; glb_models_df <- sav_models_df; glm_featsimp_df <- sav_featsimp_df
# easier to exclude features
#model_id_pfx <- "";
# indep_vars_vctr <- setdiff(names(glb_fitobs_df),
# union(union(glb_rsp_var, glb_exclude_vars_as_features),
# c("<feat1_name>", "<feat2_name>")))
# method <- ""
# easier to include features
#model_id <- "Rank9.2"; indep_vars_vctr <- c(NULL
# ,"<feat1>"
# ,"<feat1>*<feat2>"
# ,"<feat1>:<feat2>"
# )
# for (method in c("bayesglm")) {
# ret_lst <- myfit_mdl(model_id=model_id, model_method=method,
# indep_vars_vctr=indep_vars_vctr,
# model_type=glb_model_type,
# rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
# fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df,
# n_cv_folds=glb_n_cv_folds, tune_models_df=glb_tune_models_df)
# csm_mdl_id <- paste0(model_id, ".", method)
# csm_featsimp_df <- myget_feats_importance(glb_models_lst[[paste0(model_id, ".", method)]]); print(head(csm_featsimp_df))
# }
#print(dsp_models_df <- orderBy(model_sel_frmla, glb_models_df)[, dsp_models_cols])
#csm_featsimp_df[grepl("H.npnct19.log", row.names(csm_featsimp_df)), , FALSE]
#csm_OOBobs_df <- glb_get_predictions(glb_OOBobs_df, mdl_id=csm_mdl_id, rsp_var_out=glb_rsp_var_out, prob_threshold_def=glb_models_df[glb_models_df$model_id == csm_mdl_id, "opt.prob.threshold.OOB"])
#print(sprintf("%s OOB confusion matrix & accuracy: ", csm_mdl_id)); print(t(confusionMatrix(csm_OOBobs_df[, paste0(glb_rsp_var_out, csm_mdl_id)], csm_OOBobs_df[, glb_rsp_var])$table))
#glb_models_df[, "max.Accuracy.OOB", FALSE]
#varImp(glb_models_lst[["Low.cor.X.glm"]])
#orderBy(~ -Overall, varImp(glb_models_lst[["All.X.2.glm"]])$importance)
#orderBy(~ -Overall, varImp(glb_models_lst[["All.X.3.glm"]])$importance)
#glb_feats_df[grepl("npnct28", glb_feats_df$id), ]
#print(sprintf("%s OOB confusion matrix & accuracy: ", glb_sel_mdl_id)); print(t(confusionMatrix(glb_OOBobs_df[, paste0(glb_rsp_var_out, glb_sel_mdl_id)], glb_OOBobs_df[, glb_rsp_var])$table))
# User specified bivariate models
# indep_vars_vctr_lst <- list()
# for (feat in setdiff(names(glb_fitobs_df),
# union(glb_rsp_var, glb_exclude_vars_as_features)))
# indep_vars_vctr_lst[["feat"]] <- feat
# User specified combinatorial models
# indep_vars_vctr_lst <- list()
# combn_mtrx <- combn(c("<feat1_name>", "<feat2_name>", "<featn_name>"),
# <num_feats_to_choose>)
# for (combn_ix in 1:ncol(combn_mtrx))
# #print(combn_mtrx[, combn_ix])
# indep_vars_vctr_lst[[combn_ix]] <- combn_mtrx[, combn_ix]
# template for myfit_mdl
# rf is hard-coded in caret to recognize only Accuracy / Kappa evaluation metrics
# only for OOB in trainControl ?
# ret_lst <- myfit_mdl_fn(model_id=paste0(model_id_pfx, ""), model_method=method,
# indep_vars_vctr=indep_vars_vctr,
# rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
# fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df,
# n_cv_folds=glb_n_cv_folds, tune_models_df=glb_tune_models_df,
# model_loss_mtrx=glb_model_metric_terms,
# model_summaryFunction=glb_model_metric_smmry,
# model_metric=glb_model_metric,
# model_metric_maximize=glb_model_metric_maximize)
# Simplify a model
# fit_df <- glb_fitobs_df; glb_mdl <- step(<complex>_mdl)
# Non-caret models
# rpart_area_mdl <- rpart(reformulate("Area", response=glb_rsp_var),
# data=glb_fitobs_df, #method="class",
# control=rpart.control(cp=0.12),
# parms=list(loss=glb_model_metric_terms))
# print("rpart_sel_wlm_mdl"); prp(rpart_sel_wlm_mdl)
#
# Display the accumulated model comparison table (one row per fitted model)
print(glb_models_df)
## model_id model_method
## MFO.lm MFO.lm lm
## Max.cor.Y.cv.0.rpart Max.cor.Y.cv.0.rpart rpart
## Max.cor.Y.cv.0.cp.0.rpart Max.cor.Y.cv.0.cp.0.rpart rpart
## Max.cor.Y.rpart Max.cor.Y.rpart rpart
## Max.cor.Y.lm Max.cor.Y.lm lm
## Interact.High.cor.Y.lm Interact.High.cor.Y.lm lm
## Low.cor.X.lm Low.cor.X.lm lm
## All.X.lm All.X.lm lm
## All.X.glm All.X.glm glm
## All.X.bayesglm All.X.bayesglm bayesglm
## All.X.no.rnorm.rpart All.X.no.rnorm.rpart rpart
## All.X.no.rnorm.rf All.X.no.rnorm.rf rf
## feats
## MFO.lm .rnorm
## Max.cor.Y.cv.0.rpart CO2, CFC.11
## Max.cor.Y.cv.0.cp.0.rpart CO2, CFC.11
## Max.cor.Y.rpart CO2, CFC.11
## Max.cor.Y.lm CO2, CFC.11
## Interact.High.cor.Y.lm CO2, CFC.11, CO2:CO2, CO2:Year, CO2:CH4
## Low.cor.X.lm CO2, CFC.11, TSI, MEI, .rnorm, Month, Aerosols
## All.X.lm CO2, Year, N2O, CH4, CFC.12, CFC.11, TSI, MEI, .rnorm, Month, Aerosols
## All.X.glm CO2, Year, N2O, CH4, CFC.12, CFC.11, TSI, MEI, .rnorm, Month, Aerosols
## All.X.bayesglm CO2, Year, N2O, CH4, CFC.12, CFC.11, TSI, MEI, .rnorm, Month, Aerosols
## All.X.no.rnorm.rpart CO2, Year, N2O, CH4, CFC.12, CFC.11, TSI, MEI, Month, Aerosols
## All.X.no.rnorm.rf CO2, Year, N2O, CH4, CFC.12, CFC.11, TSI, MEI, Month, Aerosols
## max.nTuningRuns min.elapsedtime.everything
## MFO.lm 0 0.458
## Max.cor.Y.cv.0.rpart 0 0.785
## Max.cor.Y.cv.0.cp.0.rpart 0 0.446
## Max.cor.Y.rpart 3 0.968
## Max.cor.Y.lm 1 0.831
## Interact.High.cor.Y.lm 1 0.825
## Low.cor.X.lm 1 0.852
## All.X.lm 1 0.844
## All.X.glm 1 0.877
## All.X.bayesglm 1 2.046
## All.X.no.rnorm.rpart 3 0.917
## All.X.no.rnorm.rf 3 2.559
## min.elapsedtime.final max.R.sq.fit min.RMSE.fit
## MFO.lm 0.003 0.001691565 0.18066338
## Max.cor.Y.cv.0.rpart 0.012 0.000000000 0.18081638
## Max.cor.Y.cv.0.cp.0.rpart 0.009 0.861511889 0.06728899
## Max.cor.Y.rpart 0.011 0.634928216 0.12421767
## Max.cor.Y.lm 0.002 0.621785879 0.11276298
## Interact.High.cor.Y.lm 0.003 0.625848093 0.11451237
## Low.cor.X.lm 0.004 0.734978272 0.09714225
## All.X.lm 0.005 0.755869879 0.09468322
## All.X.glm 0.019 0.755869879 0.09468322
## All.X.bayesglm 0.029 0.755829766 0.09464890
## All.X.no.rnorm.rpart 0.027 0.704591347 0.10074676
## All.X.no.rnorm.rf 0.310 0.970436391 0.07275241
## max.R.sq.OOB min.RMSE.OOB max.Adj.R.sq.fit
## MFO.lm -0.01939772 0.15776912 -0.001848536
## Max.cor.Y.cv.0.rpart 0.00000000 0.15626085 NA
## Max.cor.Y.cv.0.cp.0.rpart 0.40828886 0.12020016 NA
## Max.cor.Y.rpart 0.45856660 0.11498010 NA
## Max.cor.Y.lm -0.78036504 0.20849935 0.619093963
## Interact.High.cor.Y.lm -0.79231656 0.20919800 0.620483908
## Low.cor.X.lm 0.37144246 0.12388613 0.728256707
## All.X.lm 0.64329559 0.09332638 0.745996970
## All.X.glm 0.64329559 0.09332638 NA
## All.X.bayesglm 0.64030820 0.09371637 NA
## All.X.no.rnorm.rpart 0.40479089 0.12055493 NA
## All.X.no.rnorm.rf 0.36588961 0.12451600 NA
## max.Rsquared.fit min.RMSESD.fit
## MFO.lm NA NA
## Max.cor.Y.cv.0.rpart NA NA
## Max.cor.Y.cv.0.cp.0.rpart NA NA
## Max.cor.Y.rpart 0.5396945 0.009132685
## Max.cor.Y.lm 0.6189480 0.010116867
## Interact.High.cor.Y.lm 0.6072395 0.009471343
## Low.cor.X.lm 0.7145240 0.009508754
## All.X.lm 0.7270083 0.008506479
## All.X.glm 0.7270083 0.008506479
## All.X.bayesglm 0.7271964 0.008485447
## All.X.no.rnorm.rpart 0.6929820 0.003522630
## All.X.no.rnorm.rf 0.8404313 0.003631677
## max.RsquaredSD.fit min.aic.fit
## MFO.lm NA NA
## Max.cor.Y.cv.0.rpart NA NA
## Max.cor.Y.cv.0.cp.0.rpart NA NA
## Max.cor.Y.rpart 0.061117938 NA
## Max.cor.Y.lm 0.061606706 NA
## Interact.High.cor.Y.lm 0.060068564 NA
## Low.cor.X.lm 0.054519087 NA
## All.X.lm 0.047954563 NA
## All.X.glm 0.047954563 -539.9334
## All.X.bayesglm 0.048325331 -539.8868
## All.X.no.rnorm.rpart 0.008054921 NA
## All.X.no.rnorm.rf 0.019493587 NA
# Drop the last fit's return value and close out the timing chunks for
# this phase of fit.models
rm(ret_lst)
fit.models_1_chunk_df <- myadd_chunk(fit.models_1_chunk_df, "fit.models_1_end",
major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 6 fit.models_1_rf 6 0 48.967 53.129 4.163
## 7 fit.models_1_end 7 0 53.130 NA NA
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc=FALSE)
## label step_major step_minor bgn end elapsed
## 11 fit.models 7 1 34.275 53.135 18.86
## 12 fit.models 7 2 53.136 NA NA
# When a user-specified summary metric is configured, recompute it for every
# fitted model on both the fit and OOB datasets and merge the resulting
# columns into glb_models_df (replacing any stale versions of those columns)
if (!is.null(glb_model_metric_smmry)) {
stats_df <- glb_models_df[, "model_id", FALSE]
stats_mdl_df <- data.frame()
# Per-model metric stats on the fit (training) observations
for (model_id in stats_df$model_id) {
stats_mdl_df <- rbind(stats_mdl_df,
mypredict_mdl(glb_models_lst[[model_id]], glb_fitobs_df, glb_rsp_var,
glb_rsp_var_out, model_id, "fit",
glb_model_metric_smmry, glb_model_metric,
glb_model_metric_maximize, ret_type="stats"))
}
stats_df <- merge(stats_df, stats_mdl_df, all.x=TRUE)
stats_mdl_df <- data.frame()
# Per-model metric stats on the OOB (holdout) observations
for (model_id in stats_df$model_id) {
stats_mdl_df <- rbind(stats_mdl_df,
mypredict_mdl(glb_models_lst[[model_id]], glb_OOBobs_df, glb_rsp_var,
glb_rsp_var_out, model_id, "OOB",
glb_model_metric_smmry, glb_model_metric,
glb_model_metric_maximize, ret_type="stats"))
}
stats_df <- merge(stats_df, stats_mdl_df, all.x=TRUE)
print("Merging following data into glb_models_df:")
# Keep model_id plus only the columns that mention the metric name
print(stats_mrg_df <- stats_df[, c(1, grep(glb_model_metric, names(stats_df)))])
print(tmp_models_df <- orderBy(~model_id, glb_models_df[, c("model_id",
grep(glb_model_metric, names(stats_df), value=TRUE))]))
# Replace the old metric columns in glb_models_df with the recomputed ones;
# merge can duplicate model_id, hence the model_id.1 drop below
tmp2_models_df <- glb_models_df[, c("model_id", setdiff(names(glb_models_df),
grep(glb_model_metric, names(stats_df), value=TRUE)))]
tmp3_models_df <- merge(tmp2_models_df, stats_mrg_df, all.x=TRUE, sort=FALSE)
print(tmp3_models_df)
print(names(tmp3_models_df))
print(glb_models_df <- subset(tmp3_models_df, select=-model_id.1))
}
# Build the radar-plot input: drop spread/CI columns, then replace every
# "min.*" metric with its reciprocal "inv.*" so that "larger is better"
# holds uniformly across all plotted metrics.
# A logical mask is used instead of -grep(...): negative indexing with an
# empty match (integer(0)) would silently drop EVERY column.
plt_models_df <- glb_models_df[, !grepl("SD|Upper|Lower", names(glb_models_df)),
drop=FALSE]
# "^min\\." anchors and escapes the dot; the original "^min." regex would
# also match names like "minX..." since an unescaped "." matches any char
for (var in grep("^min\\.", names(plt_models_df), value=TRUE)) {
plt_models_df[, sub("^min\\.", "inv.", var)] <-
#ifelse(all(is.na(tmp <- plt_models_df[, var])), NA, 1.0 / tmp)
1.0 / plt_models_df[, var]
# Exact-name drop; grep(var, ...) treated the column name as a regex
# (its dots match any character) and could remove unintended columns
plt_models_df <- plt_models_df[, setdiff(names(plt_models_df), var),
drop=FALSE]
}
print(plt_models_df)
## model_id model_method
## MFO.lm MFO.lm lm
## Max.cor.Y.cv.0.rpart Max.cor.Y.cv.0.rpart rpart
## Max.cor.Y.cv.0.cp.0.rpart Max.cor.Y.cv.0.cp.0.rpart rpart
## Max.cor.Y.rpart Max.cor.Y.rpart rpart
## Max.cor.Y.lm Max.cor.Y.lm lm
## Interact.High.cor.Y.lm Interact.High.cor.Y.lm lm
## Low.cor.X.lm Low.cor.X.lm lm
## All.X.lm All.X.lm lm
## All.X.glm All.X.glm glm
## All.X.bayesglm All.X.bayesglm bayesglm
## All.X.no.rnorm.rpart All.X.no.rnorm.rpart rpart
## All.X.no.rnorm.rf All.X.no.rnorm.rf rf
## feats
## MFO.lm .rnorm
## Max.cor.Y.cv.0.rpart CO2, CFC.11
## Max.cor.Y.cv.0.cp.0.rpart CO2, CFC.11
## Max.cor.Y.rpart CO2, CFC.11
## Max.cor.Y.lm CO2, CFC.11
## Interact.High.cor.Y.lm CO2, CFC.11, CO2:CO2, CO2:Year, CO2:CH4
## Low.cor.X.lm CO2, CFC.11, TSI, MEI, .rnorm, Month, Aerosols
## All.X.lm CO2, Year, N2O, CH4, CFC.12, CFC.11, TSI, MEI, .rnorm, Month, Aerosols
## All.X.glm CO2, Year, N2O, CH4, CFC.12, CFC.11, TSI, MEI, .rnorm, Month, Aerosols
## All.X.bayesglm CO2, Year, N2O, CH4, CFC.12, CFC.11, TSI, MEI, .rnorm, Month, Aerosols
## All.X.no.rnorm.rpart CO2, Year, N2O, CH4, CFC.12, CFC.11, TSI, MEI, Month, Aerosols
## All.X.no.rnorm.rf CO2, Year, N2O, CH4, CFC.12, CFC.11, TSI, MEI, Month, Aerosols
## max.nTuningRuns max.R.sq.fit max.R.sq.OOB
## MFO.lm 0 0.001691565 -0.01939772
## Max.cor.Y.cv.0.rpart 0 0.000000000 0.00000000
## Max.cor.Y.cv.0.cp.0.rpart 0 0.861511889 0.40828886
## Max.cor.Y.rpart 3 0.634928216 0.45856660
## Max.cor.Y.lm 1 0.621785879 -0.78036504
## Interact.High.cor.Y.lm 1 0.625848093 -0.79231656
## Low.cor.X.lm 1 0.734978272 0.37144246
## All.X.lm 1 0.755869879 0.64329559
## All.X.glm 1 0.755869879 0.64329559
## All.X.bayesglm 1 0.755829766 0.64030820
## All.X.no.rnorm.rpart 3 0.704591347 0.40479089
## All.X.no.rnorm.rf 3 0.970436391 0.36588961
## max.Adj.R.sq.fit max.Rsquared.fit
## MFO.lm -0.001848536 NA
## Max.cor.Y.cv.0.rpart NA NA
## Max.cor.Y.cv.0.cp.0.rpart NA NA
## Max.cor.Y.rpart NA 0.5396945
## Max.cor.Y.lm 0.619093963 0.6189480
## Interact.High.cor.Y.lm 0.620483908 0.6072395
## Low.cor.X.lm 0.728256707 0.7145240
## All.X.lm 0.745996970 0.7270083
## All.X.glm NA 0.7270083
## All.X.bayesglm NA 0.7271964
## All.X.no.rnorm.rpart NA 0.6929820
## All.X.no.rnorm.rf NA 0.8404313
## inv.elapsedtime.everything inv.elapsedtime.final
## MFO.lm 2.1834061 333.333333
## Max.cor.Y.cv.0.rpart 1.2738854 83.333333
## Max.cor.Y.cv.0.cp.0.rpart 2.2421525 111.111111
## Max.cor.Y.rpart 1.0330579 90.909091
## Max.cor.Y.lm 1.2033694 500.000000
## Interact.High.cor.Y.lm 1.2121212 333.333333
## Low.cor.X.lm 1.1737089 250.000000
## All.X.lm 1.1848341 200.000000
## All.X.glm 1.1402509 52.631579
## All.X.bayesglm 0.4887586 34.482759
## All.X.no.rnorm.rpart 1.0905125 37.037037
## All.X.no.rnorm.rf 0.3907776 3.225806
## inv.RMSE.fit inv.RMSE.OOB inv.aic.fit
## MFO.lm 5.535156 6.338376 NA
## Max.cor.Y.cv.0.rpart 5.530473 6.399556 NA
## Max.cor.Y.cv.0.cp.0.rpart 14.861272 8.319456 NA
## Max.cor.Y.rpart 8.050384 8.697157 NA
## Max.cor.Y.lm 8.868159 4.796178 NA
## Interact.High.cor.Y.lm 8.732681 4.780160 NA
## Low.cor.X.lm 10.294182 8.071929 NA
## All.X.lm 10.561534 10.715084 NA
## All.X.glm 10.561534 10.715084 -0.00185208
## All.X.bayesglm 10.565364 10.670495 -0.00185224
## All.X.no.rnorm.rpart 9.925878 8.294974 NA
## All.X.no.rnorm.rf 13.745250 8.031096 NA
print(myplot_radar(radar_inp_df=plt_models_df))
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 12. Consider specifying shapes manually. if you must have them.
## Warning in loop_apply(n, do.ply): Removed 7 rows containing missing values
## (geom_path).
## Warning in loop_apply(n, do.ply): Removed 68 rows containing missing values
## (geom_point).
## Warning in loop_apply(n, do.ply): Removed 20 rows containing missing values
## (geom_text).
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 12. Consider specifying shapes manually. if you must have them.
# print(myplot_radar(radar_inp_df=subset(plt_models_df,
# !(model_id %in% grep("random|MFO", plt_models_df$model_id, value=TRUE)))))
# Compute CI for <metric>SD
# max.df = degrees of freedom (tuning runs - 1); min.sd2ci.scaler = the
# two-sided 95% Student-t quantile used to turn an SD into a CI half-width.
# Both are NA for models with <= 1 tuning run.
glb_models_df <- mutate(glb_models_df,
max.df = ifelse(max.nTuningRuns > 1, max.nTuningRuns - 1, NA),
min.sd2ci.scaler = ifelse(is.na(max.df), NA, qt(0.975, max.df)))
# For every "<prefix>SD<suffix>" column, derive "<prefix>Upper<suffix>" and
# "<prefix>Lower<suffix>" confidence bounds around "<prefix><suffix>" using
# the t-based scaler computed above (CI width depends on sample size; df=n-1).
for (sd_col in grep("SD", names(glb_models_df), value=TRUE)) {
# Does CI already exist ?
parts <- unlist(strsplit(sd_col, "SD"))
stat_col <- paste0(parts[1], parts[2])
upper_col <- paste0(parts[1], "Upper", parts[2])
lower_col <- paste0(parts[1], "Lower", parts[2])
if (upper_col %in% names(glb_models_df)) {
warning(upper_col, " already exists in glb_models_df")
# Assuming Lower also exists
next
}
print(sprintf("var:%s", sd_col))
# CI is dependent on sample size in t distribution; df=n-1
half_width <- glb_models_df[, "min.sd2ci.scaler"] * glb_models_df[, sd_col]
glb_models_df[, upper_col] <- glb_models_df[, stat_col] + half_width
glb_models_df[, lower_col] <- glb_models_df[, stat_col] - half_width
}
## [1] "var:min.RMSESD.fit"
## [1] "var:max.RsquaredSD.fit"
# Plot metrics with CI
# plt_models_df collects the point-estimate columns; pltCI_models_df
# collects the matching Upper/Lower bound columns, one pair per metric
plt_models_df <- glb_models_df[, "model_id", FALSE]
pltCI_models_df <- glb_models_df[, "model_id", FALSE]
for (var in grep("Upper", names(glb_models_df), value=TRUE)) {
# e.g. "min.RMSEUpper.fit" -> point column "min.RMSE.fit"
var_components <- unlist(strsplit(var, "Upper"))
col_name <- unlist(paste(var_components, collapse=""))
plt_models_df[, col_name] <- glb_models_df[, col_name]
for (name in paste0(var_components[1], c("Upper", "Lower"), var_components[2]))
pltCI_models_df[, name] <- glb_models_df[, name]
}
# Melt a wide per-model metrics table into long format and annotate each
# row with:
#   data  - the dataset suffix, i.e. the component after the last "."
#           in the variable name (e.g. "fit" / "OOB")
#   label - the metric name, i.e. everything before that ".<data>" suffix
# Returns the melted data.frame with "data" and "label" columns added.
build_statsCI_data <- function(plt_models_df) {
mltd_models_df <- melt(plt_models_df, id.vars="model_id")
mltd_models_df$data <- sapply(1:nrow(mltd_models_df),
function(row_ix) tail(unlist(strsplit(as.character(
mltd_models_df[row_ix, "variable"]), "[.]")), 1))
# Escape the "." and anchor the pattern at the end of the string: the
# original split on paste0(".", data) treated "." as "any character"
# and was unanchored, so a metric name containing the data token
# (e.g. "fitted" when data == "fit") would be split in the middle.
mltd_models_df$label <- sapply(1:nrow(mltd_models_df),
function(row_ix) head(unlist(strsplit(as.character(
mltd_models_df[row_ix, "variable"]),
paste0("\\.", mltd_models_df[row_ix, "data"], "$"))), 1))
#print(mltd_models_df)
return(mltd_models_df)
}
# Long-format point estimates, then long-format CI bounds
mltd_models_df <- build_statsCI_data(plt_models_df)
mltdCI_models_df <- melt(pltCI_models_df, id.vars="model_id")
# Tag each melted CI row with its metric label, dataset (fit/OOB) and bound
# type, parsed by splitting the variable name on "Upper"/"Lower"
for (row_ix in 1:nrow(mltdCI_models_df)) {
for (type in c("Upper", "Lower")) {
# strsplit yields > 1 piece only when the variable name contains <type>
if (length(var_components <- unlist(strsplit(
as.character(mltdCI_models_df[row_ix, "variable"]), type))) > 1) {
#print(sprintf("row_ix:%d; type:%s; ", row_ix, type))
mltdCI_models_df[row_ix, "label"] <- var_components[1]
mltdCI_models_df[row_ix, "data"] <-
unlist(strsplit(var_components[2], "[.]"))[2]
mltdCI_models_df[row_ix, "type"] <- type
break
}
}
}
# Spread Upper/Lower rows into value.Upper / value.Lower columns, keyed by
# (model_id, label, data).
wideCI_models_df <- reshape(subset(mltdCI_models_df, select=-variable),
timevar="type",
idvar=setdiff(names(mltdCI_models_df), c("type", "value", "variable")),
direction="wide")
#print(wideCI_models_df)
mrgdCI_models_df <- merge(wideCI_models_df, mltd_models_df, all.x=TRUE)
#print(mrgdCI_models_df)
# Merge stats back in if CIs don't exist
# Cross every label with every data tag; combos that never appeared in the
# melted stats (e.g. a fit-only metric crossed with "OOB") are back-filled
# from glb_models_df below. Assumes glb_models_df carries a column for each
# such combo.
goback_vars <- c()
for (var in unique(mltd_models_df$label)) {
for (type in unique(mltd_models_df$data)) {
var_type <- paste0(var, ".", type)
# if this data is already present, next
if (var_type %in% unique(paste(mltd_models_df$label, mltd_models_df$data,
sep=".")))
next
#print(sprintf("var_type:%s", var_type))
goback_vars <- c(goback_vars, var_type)
}
}
if (length(goback_vars) > 0) {
mltd_goback_df <- build_statsCI_data(glb_models_df[, c("model_id", goback_vars)])
mltd_models_df <- rbind(mltd_models_df, mltd_goback_df)
}
# Attach the model method so the bar chart can color by algorithm family.
mltd_models_df <- merge(mltd_models_df, glb_models_df[, c("model_id", "model_method")],
all.x=TRUE)
# Render the metrics-with-CI bar chart to a 3x2-sized PNG, then echo it to
# the active device. Error bars come from the merged CI data; facets are
# metric (label) x data partition.
png(paste0(glb_out_pfx, "models_bar.png"), width=480*3, height=480*2)
print(gp <- myplot_bar(mltd_models_df, "model_id", "value", colorcol_name="model_method") +
geom_errorbar(data=mrgdCI_models_df,
mapping=aes(x=model_id, ymax=value.Upper, ymin=value.Lower), width=0.5) +
facet_grid(label ~ data, scales="free") +
theme(axis.text.x = element_text(angle = 90,vjust = 0.5)))
## Warning in loop_apply(n, do.ply): Removed 3 rows containing missing values
## (position_stack).
dev.off()
## quartz_off_screen
## 2
print(gp)
## Warning in loop_apply(n, do.ply): Removed 3 rows containing missing values
## (position_stack).
# used for console inspection
# Build the orderBy() sort formula from the evaluation criteria: "max.*"
# metrics sort descending ("-" prefix), others ascending ("+" prefix).
model_evl_terms <- c(NULL)
for (metric in glb_model_evl_criteria)
model_evl_terms <- c(model_evl_terms,
ifelse(length(grep("max", metric)) > 0, "-", "+"), metric)
if (glb_is_classification && glb_is_binomial)
model_evl_terms <- c(model_evl_terms, "-", "opt.prob.threshold.OOB")
model_sel_frmla <- as.formula(paste(c("~ ", model_evl_terms), collapse=" "))
dsp_models_cols <- c("model_id", glb_model_evl_criteria)
if (glb_is_classification && glb_is_binomial)
dsp_models_cols <- c(dsp_models_cols, "opt.prob.threshold.OOB")
# Leaderboard: models ranked by the selection formula.
print(dsp_models_df <- orderBy(model_sel_frmla, glb_models_df)[, dsp_models_cols])
## model_id min.RMSE.OOB max.R.sq.OOB max.Adj.R.sq.fit
## 9 All.X.glm 0.09332638 0.64329559 NA
## 8 All.X.lm 0.09332638 0.64329559 0.745996970
## 10 All.X.bayesglm 0.09371637 0.64030820 NA
## 4 Max.cor.Y.rpart 0.11498010 0.45856660 NA
## 3 Max.cor.Y.cv.0.cp.0.rpart 0.12020016 0.40828886 NA
## 11 All.X.no.rnorm.rpart 0.12055493 0.40479089 NA
## 7 Low.cor.X.lm 0.12388613 0.37144246 0.728256707
## 12 All.X.no.rnorm.rf 0.12451600 0.36588961 NA
## 2 Max.cor.Y.cv.0.rpart 0.15626085 0.00000000 NA
## 1 MFO.lm 0.15776912 -0.01939772 -0.001848536
## 5 Max.cor.Y.lm 0.20849935 -0.78036504 0.619093963
## 6 Interact.High.cor.Y.lm 0.20919800 -0.79231656 0.620483908
# Radar chart of the same leaderboard metrics.
print(myplot_radar(radar_inp_df=dsp_models_df))
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 12. Consider specifying shapes manually. if you must have them.
## Warning in loop_apply(n, do.ply): Removed 6 rows containing missing values
## (geom_path).
## Warning in loop_apply(n, do.ply): Removed 25 rows containing missing values
## (geom_point).
## Warning in loop_apply(n, do.ply): Removed 7 rows containing missing values
## (geom_text).
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 12. Consider specifying shapes manually. if you must have them.
print("Metrics used for model selection:"); print(model_sel_frmla)
## [1] "Metrics used for model selection:"
## ~+min.RMSE.OOB - max.R.sq.OOB - max.Adj.R.sq.fit
print(sprintf("Best model id: %s", dsp_models_df[1, "model_id"]))
## [1] "Best model id: All.X.glm"
# Auto-select the top-ranked model unless the user pinned one; skip to the
# runner-up if the winner uses interaction terms (unsupported downstream by
# myextract_mdl_feats).
if (is.null(glb_sel_mdl_id)) {
glb_sel_mdl_id <- dsp_models_df[1, "model_id"]
if (glb_sel_mdl_id == "Interact.High.cor.Y.glm") {
warning("glb_sel_mdl_id: Interact.High.cor.Y.glm; myextract_mdl_feats does not currently support interaction terms")
glb_sel_mdl_id <- dsp_models_df[2, "model_id"]
}
} else
print(sprintf("User specified selection: %s", glb_sel_mdl_id))
myprint_mdl(glb_sel_mdl <- glb_models_lst[[glb_sel_mdl_id]])
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.27254 -0.05743 -0.00018 0.05243 0.32473
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.376e+02 5.560e+01 -2.475 0.01392 *
## CO2 2.655e-03 3.247e-03 0.818 0.41422
## Year 6.354e-03 2.121e-02 0.300 0.76471
## N2O -1.594e-02 1.846e-02 -0.863 0.38873
## CH4 1.868e-04 5.323e-04 0.351 0.72585
## CFC.12 3.683e-03 1.752e-03 2.102 0.03648 *
## CFC.11 -6.653e-03 2.318e-03 -2.871 0.00442 **
## TSI 9.427e-02 1.850e-02 5.095 6.52e-07 ***
## MEI 6.471e-02 6.487e-03 9.975 < 2e-16 ***
## .rnorm -6.153e-03 6.053e-03 -1.017 0.31025
## Month -3.706e-03 2.178e-03 -1.701 0.09004 .
## Aerosols -1.568e+00 2.211e-01 -7.090 1.15e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.008333862)
##
## Null deviance: 9.2853 on 283 degrees of freedom
## Residual deviance: 2.2668 on 272 degrees of freedom
## AIC: -539.93
##
## Number of Fisher Scoring iterations: 2
## [1] TRUE
# From here to save(), this should all be in one function
# these are executed in the same seq twice more:
# fit.data.training & predict.data.new chunks
# Append predictions for model `mdl_id` to `df` and return the augmented df.
# Columns added (all prefixed <rsp_var_out><mdl_id>):
#   regression:              raw prediction plus ".err" = absolute error
#   binomial classification: ".prob" (2nd-class probability) and the
#                            thresholded factor prediction
#   non-binomial classif.:   raw class prediction and per-class ".prob"
# prob_threshold_def: fallback threshold when the model has none recorded;
# only forced on the binomial-classification path (R lazy evaluation), so
# callers on the regression path may pass anything.
# Reads globals: glb_models_lst, glb_models_df, glb_is_regression,
# glb_is_classification, glb_is_binomial, glb_rsp_var.
glb_get_predictions <- function(df, mdl_id, rsp_var_out, prob_threshold_def=NULL) {
mdl <- glb_models_lst[[mdl_id]]
rsp_var_out <- paste0(rsp_var_out, mdl_id)
if (glb_is_regression) {
df[, rsp_var_out] <- predict(mdl, newdata=df, type="raw")
print(myplot_scatter(df, glb_rsp_var, rsp_var_out, smooth=TRUE))
df[, paste0(rsp_var_out, ".err")] <-
abs(df[, rsp_var_out] - df[, glb_rsp_var])
# Show the worst predictions first: the "-" term makes orderBy descending.
print(head(orderBy(reformulate(c("-", paste0(rsp_var_out, ".err"))),
df)))
}
if (glb_is_classification && glb_is_binomial) {
prob_threshold <- glb_models_df[glb_models_df$model_id == mdl_id,
"opt.prob.threshold.OOB"]
if (is.null(prob_threshold) || is.na(prob_threshold)) {
warning("Using default probability threshold: ", prob_threshold_def)
if (is.null(prob_threshold <- prob_threshold_def))
stop("Default probability threshold is NULL")
}
df[, paste0(rsp_var_out, ".prob")] <-
predict(mdl, newdata=df, type="prob")[, 2]
# (prob >= threshold) * 1 + 1 indexes levels() at 2 (positive) or 1.
df[, rsp_var_out] <-
factor(levels(df[, glb_rsp_var])[
(df[, paste0(rsp_var_out, ".prob")] >=
prob_threshold) * 1 + 1], levels(df[, glb_rsp_var]))
# prediction stats already reported by myfit_mdl ???
}
if (glb_is_classification && !glb_is_binomial) {
df[, rsp_var_out] <- predict(mdl, newdata=df, type="raw")
df[, paste0(rsp_var_out, ".prob")] <-
predict(mdl, newdata=df, type="prob")
}
return(df)
}
# Score the selected model on the out-of-bag (held-out) observations.
glb_OOBobs_df <- glb_get_predictions(df=glb_OOBobs_df, mdl_id=glb_sel_mdl_id,
rsp_var_out=glb_rsp_var_out)
## Year Month MEI CO2 CH4 N2O CFC.11 CFC.12 TSI
## 297 2008 1 -1.011 385.44 1809.92 321.328 246.183 536.876 1365.716
## 298 2008 2 -1.402 385.73 1803.45 321.345 245.898 536.484 1365.737
## 301 2008 5 -0.355 388.50 1796.43 321.420 244.914 535.399 1365.717
## 299 2008 3 -1.635 385.97 1792.84 321.295 245.430 535.979 1365.673
## 285 2007 1 0.974 382.93 1799.66 320.561 248.372 539.206 1365.717
## 302 2008 6 0.128 387.88 1791.80 321.447 244.676 535.128 1365.673
## Aerosols Temp .rownames .src .rnorm Temp.predict.All.X.glm
## 297 0.0038 0.074 297 Test 1.0852121 0.3652026
## 298 0.0036 0.198 298 Test -0.2598725 0.3464413
## 301 0.0031 0.283 301 Test -0.4882599 0.4108612
## 299 0.0034 0.447 299 Test -0.3150076 0.3229830
## 285 0.0054 0.601 285 Test 1.1545630 0.4821193
## 302 0.0031 0.315 302 Test 0.7576189 0.4241928
## Temp.predict.All.X.glm.err
## 297 0.2912026
## 298 0.1484413
## 301 0.1278612
## 299 0.1240170
## 285 0.1188807
## 302 0.1091928
# NOTE(review): exact `==` here is only meaningful for a factor/classification
# response; for a numeric regression response this is floating-point equality
# and is FALSE on essentially every row (as the output above shows).
predct_accurate_var_name <- paste0(glb_rsp_var_out, glb_sel_mdl_id, ".accurate")
glb_OOBobs_df[, predct_accurate_var_name] <-
(glb_OOBobs_df[, glb_rsp_var] ==
glb_OOBobs_df[, paste0(glb_rsp_var_out, glb_sel_mdl_id)])
#stop(here"); #sav_models_lst <- glb_models_lst; sav_models_df <- glb_models_df
# Feature importance of the selected model, copied into a model-specific
# column so later models' importances can sit alongside it.
glb_featsimp_df <-
myget_feats_importance(mdl=glb_sel_mdl, featsimp_df=NULL)
glb_featsimp_df[, paste0(glb_sel_mdl_id, ".importance")] <- glb_featsimp_df$importance
print(glb_featsimp_df)
## importance All.X.glm.importance
## MEI 100.0000000 100.0000000
## Aerosols 70.1835382 70.1835382
## TSI 49.5638130 49.5638130
## CFC.11 26.5708864 26.5708864
## CFC.12 18.6270814 18.6270814
## Month 14.4858252 14.4858252
## .rnorm 7.4102055 7.4102055
## N2O 5.8258553 5.8258553
## CO2 5.3550289 5.3550289
## CH4 0.5313048 0.5313048
## Year 0.0000000 0.0000000
# Used again in fit.data.training & predict.data.new chunks
# Diagnostic plots for a fitted model on `obs_df`: scatter plots of the (up
# to 5) most important features vs. actual & predicted response, plus a
# regression- or classification-specific prediction plot.
# prob_threshold: decision threshold, used only for classification plots.
# Reads globals: glb_featsimp_df, glb_rsp_var, glb_rsp_var_out, glb_id_var,
# glb_is_regression, glb_is_classification. Side effects: prints plots,
# may warn.
glb_analytics_diag_plots <- function(obs_df, mdl_id, prob_threshold=NULL) {
featsimp_df <- glb_featsimp_df
# Strip backtick quoting, then split "a:b" interaction terms into the base
# feature and its interaction partner (NA when not an interaction).
featsimp_df$feat <- gsub("`(.*?)`", "\\1", row.names(featsimp_df))
featsimp_df$feat.interact <- gsub("(.*?):(.*)", "\\2", featsimp_df$feat)
featsimp_df$feat <- gsub("(.*?):(.*)", "\\1", featsimp_df$feat)
featsimp_df$feat.interact <- ifelse(featsimp_df$feat.interact == featsimp_df$feat,
NA, featsimp_df$feat.interact)
# Collapse dummy-variable names (e.g. "x.fctrLevel") back to "x.fctr" so a
# factor counts as one feature, then keep the max importance per feature.
featsimp_df$feat <- gsub("(.*?)\\.fctr(.*)", "\\1\\.fctr", featsimp_df$feat)
featsimp_df$feat.interact <- gsub("(.*?)\\.fctr(.*)", "\\1\\.fctr", featsimp_df$feat.interact)
featsimp_df <- orderBy(~ -importance.max, summaryBy(importance ~ feat + feat.interact,
data=featsimp_df, FUN=max))
#rex_str=":(.*)"; txt_vctr=tail(featsimp_df$feat); ret_lst <- regexec(rex_str, txt_vctr); ret_lst <- regmatches(txt_vctr, ret_lst); ret_vctr <- sapply(1:length(ret_lst), function(pos_ix) ifelse(length(ret_lst[[pos_ix]]) > 0, ret_lst[[pos_ix]], "")); print(ret_vctr <- ret_vctr[ret_vctr != ""])
if (nrow(featsimp_df) > 5) {
warning("Limiting important feature scatter plots to 5 out of ", nrow(featsimp_df))
featsimp_df <- head(featsimp_df, 5)
}
# if (!all(is.na(featsimp_df$feat.interact)))
# stop("not implemented yet")
rsp_var_out <- paste0(glb_rsp_var_out, mdl_id)
# One actual-vs-predicted scatter per top feature.
for (var in featsimp_df$feat) {
plot_df <- melt(obs_df, id.vars=var,
measure.vars=c(glb_rsp_var, rsp_var_out))
# if (var == "<feat_name>") print(myplot_scatter(plot_df, var, "value",
# facet_colcol_name="variable") +
# geom_vline(xintercept=<divider_val>, linetype="dotted")) else
print(myplot_scatter(plot_df, var, "value", colorcol_name="variable",
facet_colcol_name="variable", jitter=TRUE) +
guides(color=FALSE))
}
if (glb_is_regression) {
if (nrow(featsimp_df) == 0)
warning("No important features in glb_fin_mdl") else
# x-axis: 2nd most important feature (or row names if only one).
print(myplot_prediction_regression(df=obs_df,
feat_x=ifelse(nrow(featsimp_df) > 1, featsimp_df$feat[2],
".rownames"),
feat_y=featsimp_df$feat[1],
rsp_var=glb_rsp_var, rsp_var_out=rsp_var_out,
id_vars=glb_id_var)
# + facet_wrap(reformulate(featsimp_df$feat[2])) # if [1 or 2] is a factor
# + geom_point(aes_string(color="<col_name>.fctr")) # to color the plot
)
}
if (glb_is_classification) {
if (nrow(featsimp_df) == 0)
warning("No features in selected model are statistically important")
else print(myplot_prediction_classification(df=obs_df,
feat_x=ifelse(nrow(featsimp_df) > 1, featsimp_df$feat[2],
".rownames"),
feat_y=featsimp_df$feat[1],
rsp_var=glb_rsp_var,
rsp_var_out=rsp_var_out,
id_vars=glb_id_var,
prob_threshold=prob_threshold)
# + geom_hline(yintercept=<divider_val>, linetype = "dotted")
)
}
}
# Diagnostics on the OOB data; classification additionally passes the
# selected model's optimal probability threshold.
if (glb_is_classification && glb_is_binomial)
glb_analytics_diag_plots(obs_df=glb_OOBobs_df, mdl_id=glb_sel_mdl_id,
prob_threshold=glb_models_df[glb_models_df$model_id == glb_sel_mdl_id,
"opt.prob.threshold.OOB"]) else
glb_analytics_diag_plots(obs_df=glb_OOBobs_df, mdl_id=glb_sel_mdl_id)
## Warning in glb_analytics_diag_plots(obs_df = glb_OOBobs_df, mdl_id =
## glb_sel_mdl_id): Limiting important feature scatter plots to 5 out of 11
## Year Month MEI CO2 CH4 N2O CFC.11 CFC.12 TSI
## 297 2008 1 -1.011 385.44 1809.92 321.328 246.183 536.876 1365.716
## 298 2008 2 -1.402 385.73 1803.45 321.345 245.898 536.484 1365.737
## 301 2008 5 -0.355 388.50 1796.43 321.420 244.914 535.399 1365.717
## 299 2008 3 -1.635 385.97 1792.84 321.295 245.430 535.979 1365.673
## 285 2007 1 0.974 382.93 1799.66 320.561 248.372 539.206 1365.717
## Aerosols Temp .rownames .src .rnorm Temp.predict.All.X.glm
## 297 0.0038 0.074 297 Test 1.0852121 0.3652026
## 298 0.0036 0.198 298 Test -0.2598725 0.3464413
## 301 0.0031 0.283 301 Test -0.4882599 0.4108612
## 299 0.0034 0.447 299 Test -0.3150076 0.3229830
## 285 0.0054 0.601 285 Test 1.1545630 0.4821193
## Temp.predict.All.X.glm.err Temp.predict.All.X.glm.accurate .label
## 297 0.2912026 FALSE 297
## 298 0.1484413 FALSE 298
## 301 0.1278612 FALSE 301
## 299 0.1240170 FALSE 299
## 285 0.1188807 FALSE 285
# gather predictions from models better than MFO.*
#mdl_id <- "Conditional.X.rf"
#mdl_id <- "Conditional.X.cp.0.rpart"
#mdl_id <- "Conditional.X.rpart"
# glb_OOBobs_df <- glb_get_predictions(df=glb_OOBobs_df, mdl_id,
# glb_rsp_var_out)
# print(t(confusionMatrix(glb_OOBobs_df[, paste0(glb_rsp_var_out, mdl_id)],
# glb_OOBobs_df[, glb_rsp_var])$table))
# FN_OOB_ids <- c(4721, 4020, 693, 92)
# print(glb_OOBobs_df[glb_OOBobs_df$UniqueID %in% FN_OOB_ids,
# grep(glb_rsp_var, names(glb_OOBobs_df), value=TRUE)])
# print(glb_OOBobs_df[glb_OOBobs_df$UniqueID %in% FN_OOB_ids,
# glb_feats_df$id[1:5]])
# print(glb_OOBobs_df[glb_OOBobs_df$UniqueID %in% FN_OOB_ids,
# glb_txt_vars])
# Dump id + all response-related columns for the OOB set; "."s in the
# model id are mapped to "_" to build a safe file name.
write.csv(glb_OOBobs_df[, c(glb_id_var,
grep(glb_rsp_var, names(glb_OOBobs_df), fixed=TRUE, value=TRUE))],
paste0(gsub(".", "_", paste0(glb_out_pfx, glb_sel_mdl_id), fixed=TRUE),
"_OOBobs.csv"), row.names=FALSE)
# print(glb_allobs_df[glb_allobs_df$UniqueID %in% FN_OOB_ids,
# glb_txt_vars])
# dsp_tbl(Headline.contains="[Ee]bola")
# sum(sel_obs(Headline.contains="[Ee]bola"))
# ftable(xtabs(Popular ~ NewsDesk.fctr, data=glb_allobs_df[sel_obs(Headline.contains="[Ee]bola") ,]))
# xtabs(NewsDesk ~ Popular, #Popular ~ NewsDesk.fctr,
# data=glb_allobs_df[sel_obs(Headline.contains="[Ee]bola") ,],
# exclude=NULL)
# print(mycreate_xtab_df(df=glb_allobs_df[sel_obs(Headline.contains="[Ee]bola") ,], c("Popular", "NewsDesk", "SectionName", "SubsectionName")))
# print(mycreate_tbl_df(df=glb_allobs_df[sel_obs(Headline.contains="[Ee]bola") ,], c("Popular", "NewsDesk", "SectionName", "SubsectionName")))
# print(mycreate_tbl_df(df=glb_allobs_df[sel_obs(Headline.contains="[Ee]bola") ,], c("Popular")))
# print(mycreate_tbl_df(df=glb_allobs_df[sel_obs(Headline.contains="[Ee]bola") ,],
# tbl_col_names=c("Popular", "NewsDesk")))
# write.csv(glb_chunks_df, paste0(glb_out_pfx, tail(glb_chunks_df, 1)$label, "_",
# tail(glb_chunks_df, 1)$step_minor, "_chunks1.csv"),
# row.names=FALSE)
# Advance the timing/bookkeeping chunk log within the fit.models step.
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc=FALSE)
## label step_major step_minor bgn end elapsed
## 12 fit.models 7 2 53.136 63.042 9.906
## 13 fit.models 7 3 63.042 NA NA
# Reconcile column sets: any columns added to the partition frames but not
# yet in glb_allobs_df are copied back row-aligned by partition markers.
print(setdiff(names(glb_trnobs_df), names(glb_allobs_df)))
## character(0)
print(setdiff(names(glb_fitobs_df), names(glb_allobs_df)))
## character(0)
print(setdiff(names(glb_OOBobs_df), names(glb_allobs_df)))
## [1] "Temp.predict.All.X.glm" "Temp.predict.All.X.glm.err"
## [3] "Temp.predict.All.X.glm.accurate"
for (col in setdiff(names(glb_OOBobs_df), names(glb_allobs_df)))
# Merge or cbind ?
glb_allobs_df[glb_allobs_df$.lcn == "OOB", col] <- glb_OOBobs_df[, col]
print(setdiff(names(glb_newobs_df), names(glb_allobs_df)))
## character(0)
# Optional checkpoint of the model-selection state (glb_save_envir is FALSE
# by default; see head of script).
if (glb_save_envir)
save(glb_feats_df,
glb_allobs_df, #glb_trnobs_df, glb_fitobs_df, glb_OOBobs_df, glb_newobs_df,
glb_models_df, dsp_models_df, glb_models_lst, glb_sel_mdl, glb_sel_mdl_id,
glb_model_type,
file=paste0(glb_out_pfx, "selmdl_dsk.RData"))
#load(paste0(glb_out_pfx, "selmdl_dsk.RData"))
# ret_lst may or may not exist at this point; the warning is tolerated.
rm(ret_lst)
## Warning in rm(ret_lst): object 'ret_lst' not found
# Advance the analytics petri net: the "model.selected" token is now
# available.
replay.petrisim(pn=glb_analytics_pn,
replay.trans=(glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
"model.selected")), flip_coord=TRUE)
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
## 2.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction data.new.prediction firing: model.selected
## 3.0000 3 0 2 1 0
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.data.training", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 13 fit.models 7 3 63.042 66.933 3.891
## 14 fit.data.training 8 0 66.934 NA NA
# 8.0: fit data training
#load(paste0(glb_inp_pfx, "dsk.RData"))
# To create specific models
# glb_fin_mdl_id <- NULL; glb_fin_mdl <- NULL;
# glb_sel_mdl_id <- "Conditional.X.cp.0.rpart";
# glb_sel_mdl <- glb_models_lst[[glb_sel_mdl_id]]; print(glb_sel_mdl)
# Refit the selected model on the FULL training set (fit + OOB) with its
# best tuning parameters pinned (min == max so caret does no re-search),
# unless the user already supplied a final model.
if (!is.null(glb_fin_mdl_id) && (glb_fin_mdl_id %in% names(glb_models_lst))) {
warning("Final model same as user selected model")
glb_fin_mdl <- glb_sel_mdl
} else {
# print(mdl_feats_df <- myextract_mdl_feats(sel_mdl=glb_sel_mdl,
# entity_df=glb_fitobs_df))
if ((model_method <- glb_sel_mdl$method) == "custom")
# get actual method from the model_id
model_method <- tail(unlist(strsplit(glb_sel_mdl_id, "[.]")), 1)
# Freeze each tuned parameter at its bestTune value; "none" placeholders
# are skipped.
tune_finmdl_df <- NULL
if (nrow(glb_sel_mdl$bestTune) > 0) {
for (param in names(glb_sel_mdl$bestTune)) {
#print(sprintf("param: %s", param))
if (glb_sel_mdl$bestTune[1, param] != "none")
tune_finmdl_df <- rbind(tune_finmdl_df,
data.frame(parameter=param,
min=glb_sel_mdl$bestTune[1, param],
max=glb_sel_mdl$bestTune[1, param],
by=1)) # by val does not matter
}
}
# Sync with parameters in mydsutils.R
require(gdata)
# Feature list is recovered from the selected model's comma-separated
# "feats" string; gdata::trim strips padding.
ret_lst <- myfit_mdl(model_id="Final", model_method=model_method,
indep_vars_vctr=trim(unlist(strsplit(glb_models_df[glb_models_df$model_id == glb_sel_mdl_id,
"feats"], "[,]"))),
model_type=glb_model_type,
rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
fit_df=glb_trnobs_df, OOB_df=NULL,
n_cv_folds=glb_n_cv_folds, tune_models_df=tune_finmdl_df,
# Automate from here
# Issues if glb_sel_mdl$method == "rf" b/c trainControl is "oob"; not "cv"
model_loss_mtrx=glb_model_metric_terms,
model_summaryFunction=glb_sel_mdl$control$summaryFunction,
model_metric=glb_sel_mdl$metric,
model_metric_maximize=glb_sel_mdl$maximize)
# myfit_mdl appends the new model; pick it up as the final model.
glb_fin_mdl <- glb_models_lst[[length(glb_models_lst)]]
glb_fin_mdl_id <- glb_models_df[length(glb_models_lst), "model_id"]
}
## Loading required package: gdata
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
##
## The following object is masked from 'package:randomForest':
##
## combine
##
## The following objects are masked from 'package:dplyr':
##
## combine, first, last
##
## The following object is masked from 'package:stats':
##
## nobs
##
## The following object is masked from 'package:utils':
##
## object.size
## [1] "fitting model: Final.glm"
## [1] " indep_vars: CO2, Year, N2O, CH4, CFC.12, CFC.11, TSI, MEI, .rnorm, Month, Aerosols"
## Aggregating results
## Fitting final model on full training set
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.27254 -0.05743 -0.00018 0.05243 0.32473
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.376e+02 5.560e+01 -2.475 0.01392 *
## CO2 2.655e-03 3.247e-03 0.818 0.41422
## Year 6.354e-03 2.121e-02 0.300 0.76471
## N2O -1.594e-02 1.846e-02 -0.863 0.38873
## CH4 1.868e-04 5.323e-04 0.351 0.72585
## CFC.12 3.683e-03 1.752e-03 2.102 0.03648 *
## CFC.11 -6.653e-03 2.318e-03 -2.871 0.00442 **
## TSI 9.427e-02 1.850e-02 5.095 6.52e-07 ***
## MEI 6.471e-02 6.487e-03 9.975 < 2e-16 ***
## .rnorm -6.153e-03 6.053e-03 -1.017 0.31025
## Month -3.706e-03 2.178e-03 -1.701 0.09004 .
## Aerosols -1.568e+00 2.211e-01 -7.090 1.15e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.008333862)
##
## Null deviance: 9.2853 on 283 degrees of freedom
## Residual deviance: 2.2668 on 272 degrees of freedom
## AIC: -539.93
##
## Number of Fisher Scoring iterations: 2
##
## [1] " calling mypredict_mdl for fit:"
## model_id model_method
## 1 Final.glm glm
## feats
## 1 CO2, Year, N2O, CH4, CFC.12, CFC.11, TSI, MEI, .rnorm, Month, Aerosols
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 1 0.95 0.016
## max.R.sq.fit min.RMSE.fit min.aic.fit max.Rsquared.fit min.RMSESD.fit
## 1 0.7558699 0.09468322 -539.9334 0.7270083 0.008506479
## max.RsquaredSD.fit
## 1 0.04795456
rm(ret_lst)
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.data.training", major.inc=FALSE)
## label step_major step_minor bgn end elapsed
## 14 fit.data.training 8 0 66.934 71.268 4.334
## 15 fit.data.training 8 1 71.269 NA NA
# Score the final model on the full training set.
# NOTE(review): ifelse(cond, x, NULL) errors if it is ever evaluated with a
# FALSE condition; this only works because prob_threshold_def is forced
# lazily and solely on the binomial-classification path. A plain
# `if (...) x else NULL` would be safer.
glb_trnobs_df <- glb_get_predictions(df=glb_trnobs_df, mdl_id=glb_fin_mdl_id,
rsp_var_out=glb_rsp_var_out,
prob_threshold_def=ifelse(glb_is_classification && glb_is_binomial,
glb_models_df[glb_models_df$model_id == glb_sel_mdl_id, "opt.prob.threshold.OOB"], NULL))
## Year Month MEI CO2 CH4 N2O CFC.11 CFC.12 TSI
## 184 1998 8 -0.410 365.79 1757.06 313.619 265.784 536.489 1366.086
## 183 1998 7 0.285 367.74 1762.46 313.542 265.787 536.151 1366.244
## 130 1994 2 0.192 358.98 1752.06 310.144 270.900 515.387 1365.988
## 190 1999 2 -1.238 368.98 1784.64 314.571 265.701 538.971 1366.239
## 178 1998 2 2.777 365.98 1766.01 313.512 266.978 535.561 1365.917
## 20 1984 12 -0.611 344.54 1672.15 305.313 207.308 372.701 1365.762
## Aerosols Temp .rownames .src .rnorm Temp.predict.Final.glm
## 184 0.0024 0.616 184 Train -0.64038261 0.29127028
## 183 0.0025 0.651 183 Train 0.51151537 0.35378571
## 130 0.0265 -0.072 130 Train 0.03471289 0.20053762
## 190 0.0022 0.540 190 Train -0.15937021 0.28624629
## 178 0.0037 0.739 178 Train -0.10478436 0.49095525
## 20 0.0188 -0.282 20 Train 0.80875087 -0.04480836
## Temp.predict.Final.glm.err
## 184 0.3247297
## 183 0.2972143
## 130 0.2725376
## 190 0.2537537
## 178 0.2480447
## 20 0.2371916
# Keep a copy of the selected model's importances, then append the final
# model's importances alongside them.
sav_featsimp_df <- glb_featsimp_df
#glb_feats_df <- sav_feats_df
# glb_feats_df <- mymerge_feats_importance(feats_df=glb_feats_df, sel_mdl=glb_fin_mdl,
# entity_df=glb_trnobs_df)
glb_featsimp_df <- myget_feats_importance(mdl=glb_fin_mdl, featsimp_df=glb_featsimp_df)
glb_featsimp_df[, paste0(glb_fin_mdl_id, ".importance")] <- glb_featsimp_df$importance
print(glb_featsimp_df)
## All.X.glm.importance importance Final.glm.importance
## MEI 100.0000000 100.0000000 100.0000000
## Aerosols 70.1835382 70.1835382 70.1835382
## TSI 49.5638130 49.5638130 49.5638130
## CFC.11 26.5708864 26.5708864 26.5708864
## CFC.12 18.6270814 18.6270814 18.6270814
## Month 14.4858252 14.4858252 14.4858252
## .rnorm 7.4102055 7.4102055 7.4102055
## N2O 5.8258553 5.8258553 5.8258553
## CO2 5.3550289 5.3550289 5.3550289
## CH4 0.5313048 0.5313048 0.5313048
## Year 0.0000000 0.0000000 0.0000000
# Diagnostics of the final model on the training data.
if (glb_is_classification && glb_is_binomial)
glb_analytics_diag_plots(obs_df=glb_trnobs_df, mdl_id=glb_fin_mdl_id,
prob_threshold=glb_models_df[glb_models_df$model_id == glb_sel_mdl_id,
"opt.prob.threshold.OOB"]) else
glb_analytics_diag_plots(obs_df=glb_trnobs_df, mdl_id=glb_fin_mdl_id)
## Warning in glb_analytics_diag_plots(obs_df = glb_trnobs_df, mdl_id =
## glb_fin_mdl_id): Limiting important feature scatter plots to 5 out of 11
## Year Month MEI CO2 CH4 N2O CFC.11 CFC.12 TSI
## 184 1998 8 -0.410 365.79 1757.06 313.619 265.784 536.489 1366.086
## 183 1998 7 0.285 367.74 1762.46 313.542 265.787 536.151 1366.244
## 130 1994 2 0.192 358.98 1752.06 310.144 270.900 515.387 1365.988
## 190 1999 2 -1.238 368.98 1784.64 314.571 265.701 538.971 1366.239
## 178 1998 2 2.777 365.98 1766.01 313.512 266.978 535.561 1365.917
## Aerosols Temp .rownames .src .rnorm Temp.predict.Final.glm
## 184 0.0024 0.616 184 Train -0.64038261 0.2912703
## 183 0.0025 0.651 183 Train 0.51151537 0.3537857
## 130 0.0265 -0.072 130 Train 0.03471289 0.2005376
## 190 0.0022 0.540 190 Train -0.15937021 0.2862463
## 178 0.0037 0.739 178 Train -0.10478436 0.4909553
## Temp.predict.Final.glm.err .label
## 184 0.3247297 184
## 183 0.2972143 183
## 130 0.2725376 130
## 190 0.2537537 190
## 178 0.2480447 178
# Collect ids of all features that have an importance recorded under any
# "*.importance" column of glb_feats_df (console-inspection helper).
dsp_feats_vctr <- c(NULL)
for(var in grep(".importance", names(glb_feats_df), fixed=TRUE, value=TRUE))
dsp_feats_vctr <- union(dsp_feats_vctr,
glb_feats_df[!is.na(glb_feats_df[, var]), "id"])
# print(glb_trnobs_df[glb_trnobs_df$UniqueID %in% FN_OOB_ids,
# grep(glb_rsp_var, names(glb_trnobs_df), value=TRUE)])
# Copy new prediction columns from the partition frames back into
# glb_allobs_df, row-aligned via the .src / .lcn markers.
print(setdiff(names(glb_trnobs_df), names(glb_allobs_df)))
## [1] "Temp.predict.Final.glm" "Temp.predict.Final.glm.err"
for (col in setdiff(names(glb_trnobs_df), names(glb_allobs_df)))
# Merge or cbind ?
glb_allobs_df[glb_allobs_df$.src == "Train", col] <- glb_trnobs_df[, col]
print(setdiff(names(glb_fitobs_df), names(glb_allobs_df)))
## character(0)
print(setdiff(names(glb_OOBobs_df), names(glb_allobs_df)))
## character(0)
for (col in setdiff(names(glb_OOBobs_df), names(glb_allobs_df)))
# Merge or cbind ?
glb_allobs_df[glb_allobs_df$.lcn == "OOB", col] <- glb_OOBobs_df[, col]
print(setdiff(names(glb_newobs_df), names(glb_allobs_df)))
## character(0)
# Optional checkpoint including the final model (off by default).
if (glb_save_envir)
save(glb_feats_df, glb_allobs_df,
#glb_trnobs_df, glb_fitobs_df, glb_OOBobs_df, glb_newobs_df,
glb_models_df, dsp_models_df, glb_models_lst, glb_model_type,
glb_sel_mdl, glb_sel_mdl_id,
glb_fin_mdl, glb_fin_mdl_id,
file=paste0(glb_out_pfx, "dsk.RData"))
# Petri net: training-set prediction done and final model built.
replay.petrisim(pn=glb_analytics_pn,
replay.trans=(glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
"data.training.all.prediction","model.final")), flip_coord=TRUE)
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
## 2.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction data.new.prediction firing: model.selected
## 3.0000 3 0 2 1 0
## 3.0000 multiple enabled transitions: model.final data.training.all.prediction data.new.prediction firing: data.training.all.prediction
## 4.0000 5 0 1 1 1
## 4.0000 multiple enabled transitions: model.final data.training.all.prediction data.new.prediction firing: model.final
## 5.0000 4 0 0 2 1
# Open the predict.data.new major step in the chunk log.
glb_chunks_df <- myadd_chunk(glb_chunks_df, "predict.data.new", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 15 fit.data.training 8 1 71.269 75.616 4.347
## 16 predict.data.new 9 0 75.616 NA NA
# 9.0: predict data new
# Compute final model predictions
# Score the final model on the new (held-out/test) observations; the same
# lazy-evaluation caveat as the training-set call applies to the
# ifelse(..., NULL) threshold default.
glb_newobs_df <- glb_get_predictions(glb_newobs_df, mdl_id=glb_fin_mdl_id,
rsp_var_out=glb_rsp_var_out,
prob_threshold_def=ifelse(glb_is_classification && glb_is_binomial,
glb_models_df[glb_models_df$model_id == glb_sel_mdl_id,
"opt.prob.threshold.OOB"], NULL))
## Year Month MEI CO2 CH4 N2O CFC.11 CFC.12 TSI
## 297 2008 1 -1.011 385.44 1809.92 321.328 246.183 536.876 1365.716
## 298 2008 2 -1.402 385.73 1803.45 321.345 245.898 536.484 1365.737
## 301 2008 5 -0.355 388.50 1796.43 321.420 244.914 535.399 1365.717
## 299 2008 3 -1.635 385.97 1792.84 321.295 245.430 535.979 1365.673
## 285 2007 1 0.974 382.93 1799.66 320.561 248.372 539.206 1365.717
## 302 2008 6 0.128 387.88 1791.80 321.447 244.676 535.128 1365.673
## Aerosols Temp .rownames .src .rnorm Temp.predict.Final.glm
## 297 0.0038 0.074 297 Test 1.0852121 0.3652026
## 298 0.0036 0.198 298 Test -0.2598725 0.3464413
## 301 0.0031 0.283 301 Test -0.4882599 0.4108612
## 299 0.0034 0.447 299 Test -0.3150076 0.3229830
## 285 0.0054 0.601 285 Test 1.1545630 0.4821193
## 302 0.0031 0.315 302 Test 0.7576189 0.4241928
## Temp.predict.Final.glm.err
## 297 0.2912026
## 298 0.1484413
## 301 0.1278612
## 299 0.1240170
## 285 0.1188807
## 302 0.1091928
# Diagnostics of the final model on the new data.
if (glb_is_classification && glb_is_binomial)
glb_analytics_diag_plots(obs_df=glb_newobs_df, mdl_id=glb_fin_mdl_id,
prob_threshold=glb_models_df[glb_models_df$model_id == glb_sel_mdl_id,
"opt.prob.threshold.OOB"]) else
glb_analytics_diag_plots(obs_df=glb_newobs_df, mdl_id=glb_fin_mdl_id)
## Warning in glb_analytics_diag_plots(obs_df = glb_newobs_df, mdl_id =
## glb_fin_mdl_id): Limiting important feature scatter plots to 5 out of 11
## Year Month MEI CO2 CH4 N2O CFC.11 CFC.12 TSI
## 297 2008 1 -1.011 385.44 1809.92 321.328 246.183 536.876 1365.716
## 298 2008 2 -1.402 385.73 1803.45 321.345 245.898 536.484 1365.737
## 301 2008 5 -0.355 388.50 1796.43 321.420 244.914 535.399 1365.717
## 299 2008 3 -1.635 385.97 1792.84 321.295 245.430 535.979 1365.673
## 285 2007 1 0.974 382.93 1799.66 320.561 248.372 539.206 1365.717
## Aerosols Temp .rownames .src .rnorm Temp.predict.Final.glm
## 297 0.0038 0.074 297 Test 1.0852121 0.3652026
## 298 0.0036 0.198 298 Test -0.2598725 0.3464413
## 301 0.0031 0.283 301 Test -0.4882599 0.4108612
## 299 0.0034 0.447 299 Test -0.3150076 0.3229830
## 285 0.0054 0.601 285 Test 1.1545630 0.4821193
## Temp.predict.Final.glm.err .label
## 297 0.2912026 297
## 298 0.1484413 298
## 301 0.1278612 301
## 299 0.1240170 299
## 285 0.1188807 285
# Assemble the submission file from the new-data predictions.
if (glb_is_classification && glb_is_binomial) {
    # Binomial classifiers submit the positive-class probability.
    submit_df <- glb_newobs_df[, c(glb_id_var,
        paste0(glb_rsp_var_out, glb_fin_mdl_id, ".prob"))]
    names(submit_df)[2] <- "Probability1"
} else {
    # Regression / non-binomial: submit the raw prediction column.
    submit_df <- glb_newobs_df[, c(glb_id_var,
        paste0(glb_rsp_var_out, glb_fin_mdl_id))]
}
# Map "."s in the model id to "_" so the file name stays filesystem-safe.
submit_fname <- paste0(gsub(".", "_", paste0(glb_out_pfx, glb_fin_mdl_id), fixed=TRUE),
                       "_submit.csv")
write.csv(submit_df, submit_fname, quote=FALSE, row.names=FALSE)
#cat(" ", "\n", file=submit_fn, append=TRUE)
# print(orderBy(~ -max.auc.OOB, glb_models_df[, c("model_id",
# "max.auc.OOB", "max.Accuracy.OOB")]))
# Final console summary: chosen threshold (classification only), selected
# and final model ids, training-set dimensions, and the leaderboard.
if (glb_is_classification && glb_is_binomial)
print(glb_models_df[glb_models_df$model_id == glb_sel_mdl_id,
"opt.prob.threshold.OOB"])
print(sprintf("glb_sel_mdl_id: %s", glb_sel_mdl_id))
## [1] "glb_sel_mdl_id: All.X.glm"
print(sprintf("glb_fin_mdl_id: %s", glb_fin_mdl_id))
## [1] "glb_fin_mdl_id: Final.glm"
print(dim(glb_fitobs_df))
## [1] 284 14
print(dsp_models_df)
## model_id min.RMSE.OOB max.R.sq.OOB max.Adj.R.sq.fit
## 9 All.X.glm 0.09332638 0.64329559 NA
## 8 All.X.lm 0.09332638 0.64329559 0.745996970
## 10 All.X.bayesglm 0.09371637 0.64030820 NA
## 4 Max.cor.Y.rpart 0.11498010 0.45856660 NA
## 3 Max.cor.Y.cv.0.cp.0.rpart 0.12020016 0.40828886 NA
## 11 All.X.no.rnorm.rpart 0.12055493 0.40479089 NA
## 7 Low.cor.X.lm 0.12388613 0.37144246 0.728256707
## 12 All.X.no.rnorm.rf 0.12451600 0.36588961 NA
## 2 Max.cor.Y.cv.0.rpart 0.15626085 0.00000000 NA
## 1 MFO.lm 0.15776912 -0.01939772 -0.001848536
## 5 Max.cor.Y.lm 0.20849935 -0.78036504 0.619093963
## 6 Interact.High.cor.Y.lm 0.20919800 -0.79231656 0.620483908
# Regression wrap-up: report the selected model's OOB RMSE and, when the
# new data carries actual response values, the final model's stats on it.
if (glb_is_regression) {
print(sprintf("%s OOB RMSE: %0.4f", glb_sel_mdl_id,
glb_models_df[glb_models_df$model_id == glb_sel_mdl_id, "min.RMSE.OOB"]))
if (!is.null(glb_category_vars)) {
# NOTE(review): the lines after stop() are unreachable scaffolding kept
# for when per-category accuracy is implemented for regression.
stop("not implemented yet")
tmp_OOBobs_df <- glb_OOBobs_df[, c(glb_category_vars, predct_accurate_var_name)]
names(tmp_OOBobs_df)[length(names(tmp_OOBobs_df))] <- "accurate.OOB"
aOOB_ctgry_df <- mycreate_xtab_df(tmp_OOBobs_df, names(tmp_OOBobs_df))
aOOB_ctgry_df[is.na(aOOB_ctgry_df)] <- 0
aOOB_ctgry_df <- mutate(aOOB_ctgry_df,
.n.OOB = accurate.OOB.FALSE + accurate.OOB.TRUE,
max.accuracy.OOB = accurate.OOB.TRUE / .n.OOB)
#intersect(names(glb_ctgry_df), names(aOOB_ctgry_df))
glb_ctgry_df <- merge(glb_ctgry_df, aOOB_ctgry_df, all=TRUE)
print(orderBy(~-accurate.OOB.FALSE, glb_ctgry_df))
}
# Only score the new data when the response is fully populated there.
if ((glb_rsp_var %in% names(glb_newobs_df)) &&
!(any(is.na(glb_newobs_df[, glb_rsp_var])))) {
pred_stats_df <-
mypredict_mdl(mdl=glb_models_lst[[glb_fin_mdl_id]],
df=glb_newobs_df,
rsp_var=glb_rsp_var,
rsp_var_out=glb_rsp_var_out,
model_id_method=glb_fin_mdl_id,
label="new",
model_summaryFunction=glb_sel_mdl$control$summaryFunction,
model_metric=glb_sel_mdl$metric,
model_metric_maximize=glb_sel_mdl$maximize,
ret_type="stats")
print(sprintf("%s prediction stats for glb_newobs_df:", glb_fin_mdl_id))
print(pred_stats_df)
}
}
## [1] "All.X.glm OOB RMSE: 0.0933"
## [1] "Final.glm prediction stats for glb_newobs_df:"
## model_id max.R.sq.new min.RMSE.new
## 1 Final.glm 0.6432956 0.09332638
# Classification-specific reporting: OOB confusion matrix of the selected model,
# optional per-category accuracy breakdown, and (when actuals are available)
# the final model's confusion matrix on the new observations.
if (glb_is_classification) {
print(sprintf("%s OOB confusion matrix & accuracy: ", glb_sel_mdl_id))
# confusionMatrix(predictions, reference); t() puts actuals on rows
print(t(confusionMatrix(glb_OOBobs_df[, paste0(glb_rsp_var_out, glb_sel_mdl_id)],
glb_OOBobs_df[, glb_rsp_var])$table))
# Accuracy per category: cross-tab of category vars vs the accurate flag,
# merged into the global category summary and sorted by error count
if (!is.null(glb_category_vars)) {
tmp_OOBobs_df <- glb_OOBobs_df[, c(glb_category_vars, predct_accurate_var_name)]
names(tmp_OOBobs_df)[length(names(tmp_OOBobs_df))] <- "accurate.OOB"
aOOB_ctgry_df <- mycreate_xtab_df(tmp_OOBobs_df, names(tmp_OOBobs_df))
aOOB_ctgry_df[is.na(aOOB_ctgry_df)] <- 0
aOOB_ctgry_df <- mutate(aOOB_ctgry_df,
.n.OOB = accurate.OOB.FALSE + accurate.OOB.TRUE,
max.accuracy.OOB = accurate.OOB.TRUE / .n.OOB)
#intersect(names(glb_ctgry_df), names(aOOB_ctgry_df))
glb_ctgry_df <- merge(glb_ctgry_df, aOOB_ctgry_df, all=TRUE)
print(orderBy(~-accurate.OOB.FALSE, glb_ctgry_df))
}
# New-data confusion matrix only when the response is present with no NAs
if ((glb_rsp_var %in% names(glb_newobs_df)) &&
!(any(is.na(glb_newobs_df[, glb_rsp_var])))) {
print(sprintf("%s new confusion matrix & accuracy: ", glb_fin_mdl_id))
print(t(confusionMatrix(glb_newobs_df[, paste0(glb_rsp_var_out, glb_fin_mdl_id)],
glb_newobs_df[, glb_rsp_var])$table))
}
}
# Display OOB diagnostics for one level of the myCategory column:
# confusion matrix, accuracy, then the false-negative and false-positive
# error rows.  Dataset-specific helper — expects glb_OOBobs_df to carry
# myCategory, Popular, Headline, Snippet, Abstract and .clusterid columns
# (NYT-blogs style data); currently invoked only from the commented-out
# calls below, so it is inert for this climate-change run.
# Fix: error ids are extracted via the configurable id column glb_id_var,
# so they must also be matched via glb_id_var — the original hard-coded
# $UniqueID there, which silently selects zero rows (or errors) whenever
# glb_id_var != "UniqueID".
dsp_myCategory_conf_mtrx <- function(myCategory) {
    print(sprintf("%s OOB::myCategory=%s confusion matrix & accuracy: ",
                  glb_sel_mdl_id, myCategory))
    print(t(confusionMatrix(
        glb_OOBobs_df[glb_OOBobs_df$myCategory == myCategory,
                      paste0(glb_rsp_var_out, glb_sel_mdl_id)],
        glb_OOBobs_df[glb_OOBobs_df$myCategory == myCategory, glb_rsp_var])$table))
    # Accuracy = fraction of correctly predicted OOB obs within this category
    print(sum(glb_OOBobs_df[glb_OOBobs_df$myCategory == myCategory,
                            predct_accurate_var_name]) /
          nrow(glb_OOBobs_df[glb_OOBobs_df$myCategory == myCategory, ]))
    # Ids of the mispredicted observations in this category
    err_ids <- glb_OOBobs_df[(glb_OOBobs_df$myCategory == myCategory) &
                             (!glb_OOBobs_df[, predct_accurate_var_name]), glb_id_var]
    # False negatives: errors whose actual Popular == 1
    OOB_FNerr_df <- glb_OOBobs_df[(glb_OOBobs_df[, glb_id_var] %in% err_ids) &
                                  (glb_OOBobs_df$Popular == 1),
                                  c(
                                      ".clusterid",
                                      "Popular", "Headline", "Snippet", "Abstract")]
    print(sprintf("%s OOB::myCategory=%s FN errors: %d", glb_sel_mdl_id, myCategory,
                  nrow(OOB_FNerr_df)))
    print(OOB_FNerr_df)
    # False positives: errors whose actual Popular == 0
    OOB_FPerr_df <- glb_OOBobs_df[(glb_OOBobs_df[, glb_id_var] %in% err_ids) &
                                  (glb_OOBobs_df$Popular == 0),
                                  c(
                                      ".clusterid",
                                      "Popular", "Headline", "Snippet", "Abstract")]
    print(sprintf("%s OOB::myCategory=%s FP errors: %d", glb_sel_mdl_id, myCategory,
                  nrow(OOB_FPerr_df)))
    print(OOB_FPerr_df)
}
#dsp_myCategory_conf_mtrx(myCategory="OpEd#Opinion#")
#dsp_myCategory_conf_mtrx(myCategory="Business#Business Day#Dealbook")
#dsp_myCategory_conf_mtrx(myCategory="##")
# if (glb_is_classification) {
# print("FN_OOB_ids:")
# print(glb_OOBobs_df[glb_OOBobs_df$UniqueID %in% FN_OOB_ids,
# grep(glb_rsp_var, names(glb_OOBobs_df), value=TRUE)])
# print(glb_OOBobs_df[glb_OOBobs_df$UniqueID %in% FN_OOB_ids,
# glb_txt_vars])
# print(dsp_vctr <- colSums(glb_OOBobs_df[glb_OOBobs_df$UniqueID %in% FN_OOB_ids,
# setdiff(grep("[HSA].", names(glb_OOBobs_df), value=TRUE),
# union(myfind_chr_cols_df(glb_OOBobs_df),
# grep(".fctr", names(glb_OOBobs_df), fixed=TRUE, value=TRUE)))]))
# }
# Inspect observations whose Headline.pfx equals hdlpfx: prints the
# response-related columns from the OOB and new datasets, the column sums
# of the H./S./A. text-feature columns (then only the non-zero ones), and
# finally the matching new-data rows restricted to those non-zero features
# plus all character columns.  Dataset-specific helper; side effects are
# print() calls only.
dsp_hdlpfx_results <- function(hdlpfx) {
    print(hdlpfx)
    # response-related columns for matching OOB observations
    print(glb_OOBobs_df[glb_OOBobs_df$Headline.pfx %in% c(hdlpfx),
                        grep(glb_rsp_var, names(glb_OOBobs_df), value=TRUE)])
    # rows of the new dataset that carry this headline prefix
    new_mask <- glb_newobs_df$Headline.pfx %in% c(hdlpfx)
    print(glb_newobs_df[new_mask,
                        grep(glb_rsp_var, names(glb_newobs_df), value=TRUE)])
    # numeric text-feature columns: names matching "[HSA]\\." minus character
    # columns and minus factorized (".fctr") columns
    feat_cols <- setdiff(grep("[HSA]\\.", names(glb_newobs_df), value=TRUE),
                         union(myfind_chr_cols_df(glb_newobs_df),
                               grep(".fctr", names(glb_newobs_df), fixed=TRUE,
                                    value=TRUE)))
    dsp_vctr <- colSums(glb_newobs_df[new_mask, feat_cols])
    print(dsp_vctr)
    # keep only the features that actually occur for this prefix
    dsp_vctr <- dsp_vctr[dsp_vctr != 0]
    print(dsp_vctr)
    print(glb_newobs_df[new_mask,
                        union(names(dsp_vctr), myfind_chr_cols_df(glb_newobs_df))])
}
#dsp_hdlpfx_results(hdlpfx="Ask Well::")
# print("myMisc::|OpEd|blank|blank|1:")
# print(glb_OOBobs_df[glb_OOBobs_df$UniqueID %in% c(6446),
# grep(glb_rsp_var, names(glb_OOBobs_df), value=TRUE)])
# print(glb_OOBobs_df[glb_OOBobs_df$UniqueID %in% FN_OOB_ids,
# c("WordCount", "WordCount.log", "myMultimedia",
# "NewsDesk", "SectionName", "SubsectionName")])
# print(mycreate_sqlxtab_df(glb_allobs_df[sel_obs(Headline.contains="[Vv]ideo"), ],
# c(glb_rsp_var, "myMultimedia")))
# dsp_chisq.test(Headline.contains="[Vi]deo")
# print(glb_allobs_df[sel_obs(Headline.contains="[Vv]ideo"),
# c(glb_rsp_var, "Popular", "myMultimedia", "Headline")])
# print(glb_allobs_df[sel_obs(Headline.contains="[Ee]bola", Popular=1),
# c(glb_rsp_var, "Popular", "myMultimedia", "Headline",
# "NewsDesk", "SectionName", "SubsectionName")])
# print(subset(glb_feats_df, !is.na(importance))[,
# c("is.ConditionalX.y",
# grep("importance", names(glb_feats_df), fixed=TRUE, value=TRUE))])
# print(subset(glb_feats_df, is.ConditionalX.y & is.na(importance))[,
# c("is.ConditionalX.y",
# grep("importance", names(glb_feats_df), fixed=TRUE, value=TRUE))])
# print(subset(glb_feats_df, !is.na(importance))[,
# c("zeroVar", "nzv", "myNearZV",
# grep("importance", names(glb_feats_df), fixed=TRUE, value=TRUE))])
# print(subset(glb_feats_df, is.na(importance))[,
# c("zeroVar", "nzv", "myNearZV",
# grep("importance", names(glb_feats_df), fixed=TRUE, value=TRUE))])
# Feature importances, sorted descending by the selected model's importance
# column (formula built dynamically from glb_sel_mdl_id, e.g. ~ -All.X.glm.importance)
print(orderBy(as.formula(paste0("~ -", glb_sel_mdl_id, ".importance")), glb_featsimp_df))
## All.X.glm.importance importance Final.glm.importance
## MEI 100.0000000 100.0000000 100.0000000
## Aerosols 70.1835382 70.1835382 70.1835382
## TSI 49.5638130 49.5638130 49.5638130
## CFC.11 26.5708864 26.5708864 26.5708864
## CFC.12 18.6270814 18.6270814 18.6270814
## Month 14.4858252 14.4858252 14.4858252
## .rnorm 7.4102055 7.4102055 7.4102055
## N2O 5.8258553 5.8258553 5.8258553
## CO2 5.3550289 5.3550289 5.3550289
## CH4 0.5313048 0.5313048 0.5313048
## Year 0.0000000 0.0000000 0.0000000
# players_df <- data.frame(id=c("Chavez", "Giambi", "Menechino", "Myers", "Pena"),
# OBP=c(0.338, 0.391, 0.369, 0.313, 0.361),
# SLG=c(0.540, 0.450, 0.374, 0.447, 0.500),
# cost=c(1400000, 1065000, 295000, 800000, 300000))
# players_df$RS.predict <- predict(glb_models_lst[[csm_mdl_id]], players_df)
# print(orderBy(~ -RS.predict, players_df))
# Merge columns created on the split data frames back into glb_allobs_df,
# matching rows by the .src/.lcn partition markers, then optionally persist
# the session objects and record chunk timing.
# NOTE(review): `diff` here shadows base::diff for the rest of the script —
# harmless in this flat script but worth renaming if refactored.
if (length(diff <- setdiff(names(glb_trnobs_df), names(glb_allobs_df))) > 0)
print(diff)
# Copy each training-only column into the matching "Train" rows of allobs
for (col in setdiff(names(glb_trnobs_df), names(glb_allobs_df)))
# Merge or cbind ?
glb_allobs_df[glb_allobs_df$.src == "Train", col] <- glb_trnobs_df[, col]
# fitobs columns are only reported, not merged back
if (length(diff <- setdiff(names(glb_fitobs_df), names(glb_allobs_df))) > 0)
print(diff)
if (length(diff <- setdiff(names(glb_OOBobs_df), names(glb_allobs_df))) > 0)
print(diff)
# Copy each OOB-only column into the matching "OOB" rows of allobs
for (col in setdiff(names(glb_OOBobs_df), names(glb_allobs_df)))
# Merge or cbind ?
glb_allobs_df[glb_allobs_df$.lcn == "OOB", col] <- glb_OOBobs_df[, col]
# newobs columns are only reported, not merged back
if (length(diff <- setdiff(names(glb_newobs_df), names(glb_allobs_df))) > 0)
print(diff)
# Optionally checkpoint the key objects (disabled: glb_save_envir is FALSE)
if (glb_save_envir)
save(glb_feats_df, glb_allobs_df,
#glb_trnobs_df, glb_fitobs_df, glb_OOBobs_df, glb_newobs_df,
glb_models_df, dsp_models_df, glb_models_lst, glb_model_type,
glb_sel_mdl, glb_sel_mdl_id,
glb_fin_mdl, glb_fin_mdl_id,
file=paste0(glb_out_pfx, "prdnew_dsk.RData"))
# tmp_OOBobs_df only exists on classification runs with category vars,
# hence the recorded warning below on this regression run
rm(submit_df, tmp_OOBobs_df)
## Warning in rm(submit_df, tmp_OOBobs_df): object 'tmp_OOBobs_df' not found
# tmp_replay_lst <- replay.petrisim(pn=glb_analytics_pn,
# replay.trans=(glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
# "data.new.prediction")), flip_coord=TRUE)
# print(ggplot.petrinet(tmp_replay_lst[["pn"]]) + coord_flip())
# Start the final pipeline chunk: session-info display
glb_chunks_df <- myadd_chunk(glb_chunks_df, "display.session.info", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 16 predict.data.new 9 0 75.616 79.426 3.81
## 17 display.session.info 10 0 79.426 NA NA
Null Hypothesis (\(\sf{H_{0}}\)): mpg is not impacted by am_fctr.
The variance by am_fctr appears to be independent. (Note: this passage and the chunk below reference the Motor Trend cars analysis, not the climate-change data in this report.)
# {r q1, cache=FALSE}
# print(t.test(subset(cars_df, am_fctr == "automatic")$mpg,
#              subset(cars_df, am_fctr == "manual")$mpg,
#              var.equal=FALSE)$conf)
We reject the null hypothesis, i.e., we have evidence to conclude that am_fctr impacts mpg (95% confidence): manual transmission yields better miles per gallon than automatic transmission.
## label step_major step_minor bgn end elapsed
## 11 fit.models 7 1 34.275 53.135 18.860
## 10 fit.models 7 0 19.130 34.274 15.144
## 12 fit.models 7 2 53.136 63.042 9.906
## 2 inspect.data 2 0 9.896 14.970 5.074
## 15 fit.data.training 8 1 71.269 75.616 4.347
## 14 fit.data.training 8 0 66.934 71.268 4.334
## 13 fit.models 7 3 63.042 66.933 3.891
## 16 predict.data.new 9 0 75.616 79.426 3.810
## 3 scrub.data 2 1 14.971 16.485 1.514
## 6 extract.features 3 0 16.583 17.753 1.170
## 8 select.features 5 0 18.034 18.850 0.817
## 1 import.data 1 0 9.500 9.896 0.396
## 7 cluster.data 4 0 17.753 18.033 0.280
## 9 partition.data.training 6 0 18.851 19.130 0.279
## 5 manage.missing.data 2 3 16.518 16.583 0.065
## 4 transform.data 2 2 16.486 16.518 0.032
## duration
## 11 18.860
## 10 15.144
## 12 9.906
## 2 5.074
## 15 4.347
## 14 4.334
## 13 3.891
## 16 3.810
## 3 1.514
## 6 1.170
## 8 0.816
## 1 0.396
## 7 0.280
## 9 0.279
## 5 0.065
## 4 0.032
## [1] "Total Elapsed Time: 79.426 secs"
## R version 3.2.0 (2015-04-16)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.10.3 (Yosemite)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] gdata_2.16.1 randomForest_4.6-10 arm_1.8-5
## [4] lme4_1.1-7 Rcpp_0.11.6 Matrix_1.2-1
## [7] MASS_7.3-40 rpart.plot_1.5.2 rpart_4.1-9
## [10] reshape2_1.4.1 dplyr_0.4.1 plyr_1.8.2
## [13] doMC_1.3.3 iterators_1.0.7 foreach_1.4.2
## [16] doBy_4.5-13 survival_2.38-1 caret_6.0-47
## [19] ggplot2_1.0.1 lattice_0.20-31
##
## loaded via a namespace (and not attached):
## [1] compiler_3.2.0 RColorBrewer_1.1-2 formatR_1.2
## [4] nloptr_1.0.4 tools_3.2.0 digest_0.6.8
## [7] evaluate_0.7 nlme_3.1-120 gtable_0.1.2
## [10] mgcv_1.8-6 DBI_0.3.1 yaml_2.1.13
## [13] brglm_0.5-9 SparseM_1.6 proto_0.3-10
## [16] coda_0.17-1 BradleyTerry2_1.0-6 stringr_1.0.0
## [19] knitr_1.10.5 gtools_3.5.0 nnet_7.3-9
## [22] rmarkdown_0.6.1 minqa_1.2.4 car_2.0-25
## [25] magrittr_1.5 scales_0.2.4 codetools_0.2-11
## [28] htmltools_0.2.6 splines_3.2.0 abind_1.4-3
## [31] assertthat_0.1 pbkrtest_0.4-2 colorspace_1.2-6
## [34] labeling_0.3 quantreg_5.11 stringi_0.4-1
## [37] lazyeval_0.1.10 munsell_0.4.2